diff --git a/Assessment_sartaj.ipynb b/Assessment_sartaj.ipynb new file mode 100644 index 00000000..f9a361d7 --- /dev/null +++ b/Assessment_sartaj.ipynb @@ -0,0 +1,123 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "fixed_width_parser" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import csv\n", + "\n", + "class FixedWidthParser:\n", + " def __init__(self, spec_file, input_file, output_file, encoding=\"utf-8\"):\n", + " self.spec_file = spec_file\n", + " self.input_file = input_file\n", + " self.output_file = output_file\n", + " self.encoding = encoding\n", + " self.column_specs = []\n", + "\n", + " def parse_spec(self):\n", + " \"\"\"Reads the spec file and extracts column names and widths.\"\"\"\n", + " with open(self.spec_file, \"r\", encoding=self.encoding) as file:\n", + " for line in file:\n", + " parts = line.strip().split()\n", + " if len(parts) == 2:\n", + " column_name, width = parts\n", + " self.column_specs.append((column_name, int(width)))\n", + "\n", + " def parse_fixed_width_file(self):\n", + " \"\"\"Reads fixed-width data and writes to CSV.\"\"\"\n", + " self.parse_spec()\n", + " \n", + " with open(self.input_file, \"r\", encoding=self.encoding) as infile, \\\n", + " open(self.output_file, \"w\", newline=\"\", encoding=self.encoding) as outfile:\n", + " writer = csv.writer(outfile)\n", + " writer.writerow([col[0] for col in self.column_specs]) # Write header\n", + "\n", + " for line in infile:\n", + " values = []\n", + " start = 0\n", + " for _, width in self.column_specs:\n", + " values.append(line[start:start+width].strip())\n", + " start += width\n", + " writer.writerow(values)\n", + "\n", + "if __name__ == \"__main__\":\n", + " parser = FixedWidthParser(\"spec.txt\", \"input_data/fixed_width.txt\", \"output_data/output.csv\")\n", + " parser.parse_fixed_width_file()\n", + " print(\"✅ Fixed-width file successfully parsed to CSV!\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "data_anonymizer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import csv\n", + "import hashlib\n", + "import random\n", + "import string\n", + "\n", + "class DataAnonymizer:\n", + " def __init__(self, input_file, output_file, chunk_size=10000):\n", + " self.input_file = input_file\n", + " self.output_file = output_file\n", + " self.chunk_size = chunk_size\n", + " self.anonymized_map = {}\n", + "\n", + " def anonymize_text(self, text):\n", + " \"\"\"Generate a consistent hash-based anonymized string.\"\"\"\n", + " if text in self.anonymized_map:\n", + " return self.anonymized_map[text]\n", + " hashed = hashlib.sha256(text.encode()).hexdigest()[:10]\n", + " fake_value = \"\".join(random.choices(string.ascii_letters, k=10))\n", + " self.anonymized_map[text] = fake_value\n", + " return fake_value\n", + "\n", + " def anonymize_csv(self):\n", + " \"\"\"Reads and anonymizes data in chunks.\"\"\"\n", + " with open(self.input_file, \"r\", encoding=\"utf-8\") as infile, \\\n", + " open(self.output_file, \"w\", newline=\"\", encoding=\"utf-8\") as outfile:\n", + " reader = csv.reader(infile)\n", + " writer = csv.writer(outfile)\n", + "\n", + " # Read header\n", + " headers = next(reader)\n", + " writer.writerow(headers)\n", + "\n", + " for row in reader:\n", + " row[0] = self.anonymize_text(row[0]) # first_name\n", + " row[1] = self.anonymize_text(row[1]) # last_name\n", + " row[2] = self.anonymize_text(row[2]) # address\n", + " writer.writerow(row)\n", + "\n", + "if __name__ == \"__main__\":\n", + " anonymizer = DataAnonymizer(\"input_data/data.csv\", \"output_data/anonymized_data.csv\")\n", + " anonymizer.anonymize_csv()\n", + " print(\"✅ Data successfully anonymized!\")\n" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}