======Huffman Encoding====== [[https://en.wikipedia.org/wiki/Huffman_coding|Huffman Coding Wikipedia]] ====From Data Algo Class=== import sys import queue class Node: def __init__(self, ch, freq, left, right): self.left = left self.right = right self.ch = ch self.freq = freq class Tree: def __init__(self, root = None): self.root = root def get_root(self): return self.root def __str__(self): visit_order = list() def traverse(node): if node: # visit the node visit_order.append(node.freq) if node.left == None and node.right == None: visit_order.append('(' + node.ch + ')') # traverse left subtree traverse(node.left) # traverse right subtree traverse(node.right) traverse(self.root) return str(visit_order) def get_codes(node, code, codes_dict): if node == None: return if node.left == None and node.right == None: codes_dict[node.ch] = code return get_codes(node.left, code + '0', codes_dict) get_codes(node.right, code + '1', codes_dict) def huffman_encoding(data): # If this is an empty string, no encoding if not data: return None, None # Create a dict of the letters and their frequency freqs = {} for c in data: count = freqs.get(c, 0) freqs[c] = count + 1 # Sort the dict items ans create a lit of tuples sorted by frequency sorted_by_freq = sorted(freqs.items(), key=lambda kv: kv[1]) # Create a queue to store nodes q = queue.Queue() # Create a leaf node for each character/freq and put in queue for letter_freq in sorted_by_freq: ch = letter_freq[0] freq = letter_freq[1] q.put(Node(ch, freq, None, None)) # If there is only one character, one code, create the root if q.qsize() == 1: left = q.get() q.put(Node("", left.freq, left, None)) # Group the nodes in pairs till there is only one node left, this is the root # If there is only one character, one code, this never gets entered while q.qsize() > 1: left, right = q.get(), q.get() q.put(Node("", left.freq + right.freq, left, right)) # The root is the only one left in the queue root = q.get() # Create a dict of the codes codes = {} get_codes(root, "", codes) # Now encode encoded_data = "" for c in data: encoded_data += codes[c] return encoded_data, Tree(root) def huffman_decoding(data,tree): if data == None or tree == None: return "" node = tree.get_root() string = "" for bit in data: if bit == '0': node = node.left else: node = node.right if node.left == None and node.right == None: string += node.ch node = tree.get_root() return string def test(string): if len(string) < 40: print ("Testing: \"%s\"" % string) else: print ("Testing: long string of %d characters" % len(string)) encoded_data, tree = huffman_encoding(string) decoded_data = huffman_decoding(encoded_data, tree) """ print ("The size of the data is: {}\n".format(sys.getsizeof(a_great_sentence))) print ("The content of the data is: {}\n".format(a_great_sentence)) encoded_data, tree = huffman_encoding(a_great_sentence) print(tree) print ("The size of the encoded data is: {}\n".format(sys.getsizeof(int(encoded_data, base=2)))) print ("The content of the encoded data is: {}\n".format(encoded_data)) decoded_data = huffman_decoding(encoded_data, tree) print ("The size of the decoded data is: {}\n".format(sys.getsizeof(decoded_data))) print ("The content of the decoded data is: {}\n".format(decoded_data)) """ if decoded_data != string: print ("\t\tError: expected \"%s\", got \"%s\"" % (string, decoded_data)) # Test sentences test("The bird is the word") test("cheesech eesecheesecheesecheesecheese2") # Test long string string = "A" for i in range(0,10000): string += "B" test(string) # Long string # Edge cases test("") # Edge case: empty string test("A") # Edge case: One character only test("AAAAAAAAAAAAAAAAAAAAA") # Edge case: repeating only one character