huffman_encoding
This is an old revision of the document!
Huffman Encoding
From Data Algo Class
(env) ms-macbook-pro:P1 jrichards$ vi p3.py
import sys import queue
class Node:
def __init__(self, ch, freq, left, right):
self.left = left
self.right = right
self.ch = ch
self.freq = freq
class Tree:
def __init__(self, root = None):
self.root = root
def get_root(self):
return self.root
def __str__(self):
visit_order = list()
def traverse(node):
if node:
# visit the node
visit_order.append(node.freq)
if node.left == None and node.right == None:
visit_order.append('(' + node.ch + ')')
# traverse left subtree
traverse(node.left)
# traverse right subtree
traverse(node.right)
traverse(self.root)
return str(visit_order)
def get_codes(node, code, codes_dict):
if node == None:
return
if node.left == None and node.right == None:
codes_dict[node.ch] = code
return
get_codes(node.left, code + '0', codes_dict) get_codes(node.right, code + '1', codes_dict)
def huffman_encoding(data):
# If this is an empty string, no encoding
if not data:
return None, None
# Create a dict of the letters and their frequency
freqs = {}
for c in data:
count = freqs.get(c, 0)
freqs[c] = count + 1
# Sort the dict items ans create a lit of tuples sorted by frequency sorted_by_freq = sorted(freqs.items(), key=lambda kv: kv[1])
# Create a queue to store nodes q = queue.Queue()
# Create a leaf node for each character/freq and put in queue
for letter_freq in sorted_by_freq:
ch = letter_freq[0]
freq = letter_freq[1]
q.put(Node(ch, freq, None, None))
# If there is only one character, one code, create the root
if q.qsize() == 1:
left = q.get()
q.put(Node("", left.freq, left, None))
# Group the nodes in pairs till there is only one node left, this is the root
# If there is only one character, one code, this never gets entered
while q.qsize() > 1:
left, right = q.get(), q.get()
q.put(Node("", left.freq + right.freq, left, right))
# The root is the only one left in the queue root = q.get()
# Create a dict of the codes
codes = {}
get_codes(root, "", codes)
# Now encode
encoded_data = ""
for c in data:
encoded_data += codes[c]
return encoded_data, Tree(root)
def huffman_decoding(data,tree):
if data == None or tree == None:
return ""
node = tree.get_root() string = ""
for bit in data:
if bit == '0':
node = node.left
else:
node = node.right
if node.left == None and node.right == None:
string += node.ch
node = tree.get_root()
return string
def test(string):
if len(string) < 40:
print ("Testing: \"%s\"" % string)
else:
print ("Testing: long string of %d characters" % len(string))
encoded_data, tree = huffman_encoding(string)
decoded_data = huffman_decoding(encoded_data, tree)
"""
print ("The size of the data is: {}\n".format(sys.getsizeof(a_great_sentence)))
print ("The content of the data is: {}\n".format(a_great_sentence))
encoded_data, tree = huffman_encoding(a_great_sentence) print(tree)
print ("The size of the encoded data is: {}\n".format(sys.getsizeof(int(encoded_data, base=2))))
print ("The content of the encoded data is: {}\n".format(encoded_data))
decoded_data = huffman_decoding(encoded_data, tree)
print ("The size of the decoded data is: {}\n".format(sys.getsizeof(decoded_data)))
print ("The content of the decoded data is: {}\n".format(decoded_data))
"""
if decoded_data != string:
print ("\t\tError: expected \"%s\", got \"%s\"" % (string, decoded_data))
# Test sentences test(“The bird is the word”) test(“cheesech eesecheesecheesecheesecheese2”)
# Test long string string = “A” for i in range(0,10000):
string += "B"
test(string) # Long string
# Edge cases test(“”) # Edge case: empty string test(“A”) # Edge case: One character only test(“AAAAAAAAAAAAAAAAAAAAA”) # Edge case: repeating only one character
huffman_encoding.1578805077.txt.gz · Last modified: 2020/01/12 04:57 by jrseti