From d1200cbed435e6dbd32d38f23a598b250507abda Mon Sep 17 00:00:00 2001 From: sushanth shetty <11s.shettyy@gmail.com> Date: Thu, 5 Dec 2024 21:04:36 +0545 Subject: [PATCH 1/2] sushanth --- compression/huffman.py | 147 +++++++++++++++++++++-------------------- 1 file changed, 77 insertions(+), 70 deletions(-) diff --git a/compression/huffman.py b/compression/huffman.py index 44eda6c03180..d591d4be4616 100644 --- a/compression/huffman.py +++ b/compression/huffman.py @@ -1,92 +1,99 @@ -from __future__ import annotations - +import heapq +from collections import defaultdict import sys +class HuffmanNode: + def __init__(self, char=None, freq=0): + self.char = char + self.freq = freq + self.left = None + self.right = None -class Letter: - def __init__(self, letter: str, freq: int): - self.letter: str = letter - self.freq: int = freq - self.bitstring: dict[str, str] = {} - - def __repr__(self) -> str: - return f"{self.letter}:{self.freq}" + def __lt__(self, other): + return self.freq < other.freq -class TreeNode: - def __init__(self, freq: int, left: Letter | TreeNode, right: Letter | TreeNode): - self.freq: int = freq - self.left: Letter | TreeNode = left - self.right: Letter | TreeNode = right +def calculate_frequencies(file_path): + """ + Reads the file and calculates the frequency of each character. + """ + freq = defaultdict(int) + with open(file_path, 'r') as file: + for line in file: + for char in line: + freq[char] += 1 + return freq -def parse_file(file_path: str) -> list[Letter]: +def build_huffman_tree(freq_dict): """ - Read the file and build a dict of all letters and their - frequencies, then convert the dict into a list of Letters. + Builds the Huffman tree using a priority queue. """ - chars: dict[str, int] = {} - with open(file_path) as f: - while True: - c = f.read(1) - if not c: - break - chars[c] = chars[c] + 1 if c in chars else 1 - return sorted((Letter(c, f) for c, f in chars.items()), key=lambda x: x.freq) + priority_queue = [HuffmanNode(char, freq) for char, freq in freq_dict.items()] + heapq.heapify(priority_queue) + + while len(priority_queue) > 1: + left = heapq.heappop(priority_queue) + right = heapq.heappop(priority_queue) + + merged = HuffmanNode(freq=left.freq + right.freq) + merged.left = left + merged.right = right + + heapq.heappush(priority_queue, merged) + + return priority_queue[0] -def build_tree(letters: list[Letter]) -> Letter | TreeNode: +def generate_codes(node, current_code="", code_map=None): """ - Run through the list of Letters and build the min heap - for the Huffman Tree. + Generates the Huffman codes by traversing the tree recursively. """ - response: list[Letter | TreeNode] = list(letters) - while len(response) > 1: - left = response.pop(0) - right = response.pop(0) - total_freq = left.freq + right.freq - node = TreeNode(total_freq, left, right) - response.append(node) - response.sort(key=lambda x: x.freq) - return response[0] - - -def traverse_tree(root: Letter | TreeNode, bitstring: str) -> list[Letter]: + if code_map is None: + code_map = {} + + if node is not None: + if node.char is not None: + code_map[node.char] = current_code + + generate_codes(node.left, current_code + "0", code_map) + generate_codes(node.right, current_code + "1", code_map) + + return code_map + + +def encode_file(file_path, code_map): """ - Recursively traverse the Huffman Tree to set each - Letter's bitstring dictionary, and return the list of Letters + Encodes the file contents using the Huffman codes. """ - if isinstance(root, Letter): - root.bitstring[root.letter] = bitstring - return [root] - treenode: TreeNode = root - letters = [] - letters += traverse_tree(treenode.left, bitstring + "0") - letters += traverse_tree(treenode.right, bitstring + "1") - return letters + encoded_output = [] + with open(file_path, 'r') as file: + for line in file: + for char in line: + encoded_output.append(code_map[char]) + + return ''.join(encoded_output) -def huffman(file_path: str) -> None: +def huffman(file_path): """ - Parse the file, build the tree, then run through the file - again, using the letters dictionary to find and print out the - bitstring for each letter. + Main function to perform Huffman encoding on a given file. """ - letters_list = parse_file(file_path) - root = build_tree(letters_list) - letters = { - k: v for letter in traverse_tree(root, "") for k, v in letter.bitstring.items() - } - print(f"Huffman Coding of {file_path}: ") - with open(file_path) as f: - while True: - c = f.read(1) - if not c: - break - print(letters[c], end=" ") - print() + freq_dict = calculate_frequencies(file_path) + huffman_tree_root = build_huffman_tree(freq_dict) + code_map = generate_codes(huffman_tree_root) + + print(f"Huffman Codes for characters in {file_path}:") + for char, code in code_map.items(): + print(f"'{char}': {code}") + + encoded_data = encode_file(file_path, code_map) + print("\nEncoded Data:") + print(encoded_data) if __name__ == "__main__": - # pass the file path to the huffman function - huffman(sys.argv[1]) + if len(sys.argv) < 2: + print("Usage: python huffman.py ") + else: + huffman(sys.argv[1]) From 25bf1925461b2c07288a00070c5ae97b6294e562 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 5 Dec 2024 15:25:35 +0000 Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- compression/huffman.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/compression/huffman.py b/compression/huffman.py index d591d4be4616..95b69170e33b 100644 --- a/compression/huffman.py +++ b/compression/huffman.py @@ -2,6 +2,7 @@ from collections import defaultdict import sys + class HuffmanNode: def __init__(self, char=None, freq=0): self.char = char @@ -18,7 +19,7 @@ def calculate_frequencies(file_path): Reads the file and calculates the frequency of each character. """ freq = defaultdict(int) - with open(file_path, 'r') as file: + with open(file_path, "r") as file: for line in file: for char in line: freq[char] += 1 @@ -67,12 +68,12 @@ def encode_file(file_path, code_map): Encodes the file contents using the Huffman codes. """ encoded_output = [] - with open(file_path, 'r') as file: + with open(file_path, "r") as file: for line in file: for char in line: encoded_output.append(code_map[char]) - return ''.join(encoded_output) + return "".join(encoded_output) def huffman(file_path):