This is an old revision of the document!
Huffman Encoding

From Data Algo Class

import sys
import queue

class Node:

    def __init__(self, ch, freq, left, right):

        self.left = left
        self.right = right
        self.ch = ch
        self.freq = freq

class Tree:

    def __init__(self, root = None):
        self.root = root

    def get_root(self):
        return self.root

    def __str__(self):

        visit_order = list()


        def traverse(node):
            if node:
                # visit the node
                visit_order.append(node.freq)

                if node.left == None and node.right == None:
                    visit_order.append('(' + node.ch + ')')

                # traverse left subtree
                traverse(node.left)

                # traverse right subtree
                traverse(node.right)



        traverse(self.root)

        return str(visit_order)

def get_codes(node, code, codes_dict):

    if node == None:
        return

    if node.left == None and node.right == None:
        codes_dict[node.ch] = code
        return

    get_codes(node.left, code + '0', codes_dict)
    get_codes(node.right, code + '1', codes_dict)

def huffman_encoding(data):

    # If this is an empty string, no encoding
    if not data:
        return None, None

    # Create a dict of the letters and their frequency
    freqs = {}
    for c in data:
        count = freqs.get(c, 0)
        freqs[c] = count + 1

    # Sort the dict items ans create a lit of tuples sorted by frequency
    sorted_by_freq = sorted(freqs.items(), key=lambda kv: kv[1])

    # Create a queue to store nodes
    q = queue.Queue()

    # Create a leaf node for each character/freq and put in queue
    for letter_freq in sorted_by_freq:
        ch = letter_freq[0]
        freq = letter_freq[1]
        q.put(Node(ch, freq, None, None))

    # If there is only one character, one code, create the root
    if q.qsize() == 1:
         left = q.get()
         q.put(Node("", left.freq, left, None))

    # Group the nodes in pairs till there is only one node left, this is the root
    # If there is only one character, one code, this never gets entered
    while q.qsize() > 1:
        left, right = q.get(), q.get()
        q.put(Node("", left.freq + right.freq, left, right))

    # The root is the only one left in the queue
    root = q.get()

    # Create a dict of the codes
    codes = {}
    get_codes(root, "", codes)

    # Now encode
    encoded_data = ""
    for c in data:
        encoded_data += codes[c]

    return encoded_data, Tree(root)



def huffman_decoding(data,tree):

    if data == None or tree == None:
        return ""

    node = tree.get_root()
    string = ""

    for bit in data:

        if bit == '0':
            node = node.left
        else:
            node = node.right

        if node.left == None and node.right == None:
            string += node.ch
            node = tree.get_root()

    return string

def test(string):

    if len(string) < 40:
        print ("Testing: \"%s\"" % string)
    else:
        print ("Testing: long string of %d characters" % len(string))
    encoded_data, tree = huffman_encoding(string)
    decoded_data = huffman_decoding(encoded_data, tree)
    """
    print ("The size of the data is: {}\n".format(sys.getsizeof(a_great_sentence)))
    print ("The content of the data is: {}\n".format(a_great_sentence))

    encoded_data, tree = huffman_encoding(a_great_sentence)
    print(tree)

    print ("The size of the encoded data is: {}\n".format(sys.getsizeof(int(encoded_data, base=2))))
    print ("The content of the encoded data is: {}\n".format(encoded_data))

    decoded_data = huffman_decoding(encoded_data, tree)

    print ("The size of the decoded data is: {}\n".format(sys.getsizeof(decoded_data)))
    print ("The content of the decoded data is: {}\n".format(decoded_data))
    """

    if decoded_data != string:
        print ("\t\tError: expected \"%s\", got \"%s\"" % (string, decoded_data))

# Test sentences
test("The bird is the word")
test("cheesech eesecheesecheesecheesecheese2")

# Test long string
string = "A"
for i in range(0,10000):
  string += "B"
test(string) # Long string

# Edge cases
test("") # Edge case: empty string
test("A") # Edge case: One character only
test("AAAAAAAAAAAAAAAAAAAAA") # Edge case: repeating only one character