This is an old revision of the document!

Huffman Encoding

From Data Algo Class

(env) ms-macbook-pro:P1 jrichards$ vi p3.py

import sys import queue

class Node:

  def __init__(self, ch, freq, left, right):

      self.left = left
      self.right = right
      self.ch = ch
      self.freq = freq

class Tree:

  def __init__(self, root = None):
      self.root = root

  def get_root(self):
      return self.root

  def __str__(self):

      visit_order = list()

      def traverse(node):
          if node:
              # visit the node
              visit_order.append(node.freq)

              if node.left == None and node.right == None:
                  visit_order.append('(' + node.ch + ')')

              # traverse left subtree
              traverse(node.left)

              # traverse right subtree
              traverse(node.right)

      traverse(self.root)

      return str(visit_order)

def get_codes(node, code, codes_dict):

  if node == None:
      return

  if node.left == None and node.right == None:
      codes_dict[node.ch] = code
      return

  get_codes(node.left, code + '0', codes_dict)
  get_codes(node.right, code + '1', codes_dict)

def huffman_encoding(data):

  # If this is an empty string, no encoding
  if not data:
      return None, None

  # Create a dict of the letters and their frequency
  freqs = {}
  for c in data:
      count = freqs.get(c, 0)
      freqs[c] = count + 1

  # Sort the dict items ans create a lit of tuples sorted by frequency
  sorted_by_freq = sorted(freqs.items(), key=lambda kv: kv[1])

  # Create a queue to store nodes
  q = queue.Queue()

  # Create a leaf node for each character/freq and put in queue
  for letter_freq in sorted_by_freq:
      ch = letter_freq[0]
      freq = letter_freq[1]
      q.put(Node(ch, freq, None, None))

  # If there is only one character, one code, create the root
  if q.qsize() == 1:
       left = q.get()
       q.put(Node("", left.freq, left, None))

  # Group the nodes in pairs till there is only one node left, this is the root
  # If there is only one character, one code, this never gets entered
  while q.qsize() > 1:
      left, right = q.get(), q.get()
      q.put(Node("", left.freq + right.freq, left, right))

  # The root is the only one left in the queue
  root = q.get()

  # Create a dict of the codes
  codes = {}
  get_codes(root, "", codes)

  # Now encode
  encoded_data = ""
  for c in data:
      encoded_data += codes[c]

  return encoded_data, Tree(root)

def huffman_decoding(data,tree):

  if data == None or tree == None:
      return ""

  node = tree.get_root()
  string = ""

  for bit in data:

      if bit == '0':
          node = node.left
      else:
          node = node.right

      if node.left == None and node.right == None:
          string += node.ch
          node = tree.get_root()

  return string

def test(string):

  if len(string) < 40:
      print ("Testing: \"%s\"" % string)
  else:
      print ("Testing: long string of %d characters" % len(string))
  encoded_data, tree = huffman_encoding(string)
  decoded_data = huffman_decoding(encoded_data, tree)
  """
  print ("The size of the data is: {}\n".format(sys.getsizeof(a_great_sentence)))
  print ("The content of the data is: {}\n".format(a_great_sentence))

  encoded_data, tree = huffman_encoding(a_great_sentence)
  print(tree)

  print ("The size of the encoded data is: {}\n".format(sys.getsizeof(int(encoded_data, base=2))))
  print ("The content of the encoded data is: {}\n".format(encoded_data))

  decoded_data = huffman_decoding(encoded_data, tree)

  print ("The size of the decoded data is: {}\n".format(sys.getsizeof(decoded_data)))
  print ("The content of the decoded data is: {}\n".format(decoded_data))
  """

  if decoded_data != string:
      print ("\t\tError: expected \"%s\", got \"%s\"" % (string, decoded_data))

# Test sentences test(“The bird is the word”) test(“cheesech eesecheesecheesecheesecheese2”)

# Test long string string = “A” for i in range(0,10000):

string += "B"

test(string) # Long string

# Edge cases test(“”) # Edge case: empty string test(“A”) # Edge case: One character only test(“AAAAAAAAAAAAAAAAAAAAA”) # Edge case: repeating only one character