diff --git a/data-processing/Drivefolder b/data-processing/Drivefolder
new file mode 100644
index 00000000..8f82f234
--- /dev/null
+++ b/data-processing/Drivefolder
@@ -0,0 +1,4 @@
+Project folder:
+The anonymized results are available at the link below.
+
+https://drive.google.com/drive/folders/1tnv5quKsPEqH7pZsuEiPP-go8kLkg8sO?usp=sharing
diff --git a/data-processing/README.md b/data-processing/README.md
new file mode 100644
index 00000000..7fcd0fb2
--- /dev/null
+++ b/data-processing/README.md
@@ -0,0 +1,48 @@
+**Problem Statement**
+The objective was to:
+
+Generate a CSV file containing the following columns:
+
+- first_name
+- last_name
+- address
+- date_of_birth
+
+Process the generated CSV file to anonymize sensitive information. The columns to be anonymized are:
+
+- first_name
+- last_name
+- address
+
+Ensure the solution works with a large dataset, around 2GB in size, and demonstrate that it can handle even larger datasets efficiently.
+
+Utilize a distributed computing platform to process large datasets effectively. In this project, Snowflake was chosen for this purpose.
+
+**Approach:**
+- Python: For generating synthetic data using the Faker library.
+- Snowflake: A cloud-based data warehousing platform used for large-scale data processing and anonymization.
+- SQL: For data manipulation and anonymization within Snowflake.
+- GitHub: For sharing the project.
+- Google Drive: For sharing large datasets, as GitHub has file size limitations.
+
+**Step 1: Data Generation**
+Python's Faker library was used to generate synthetic first names, last names, addresses, and dates of birth.
+**The Python code is in the repository as samplefakedatagenerator.py**
+
+**Step 2: Loading Data into Snowflake**
+Since Snowflake's web UI has a file upload limit of 250MB, the generated dataset needed to be split into smaller parts before loading.
+
+Splitting the Large CSV File - **large_dataset.csv**
+To split the large CSV file into manageable parts, the following command was run in the terminal:
+
+`split -b 200m large_dataset.csv part`
+
+This resulted in files named part_aa, part_ab, part_ac, ... up to part_aj.
+
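+Note that `split -b` splits on raw bytes, so a row can end up broken across two files. For reference, here is a minimal pure-Python sketch (not part of the submitted scripts; the 200 MB target, the `split_csv` helper, and the `part_` prefix are illustrative) that splits on row boundaries and repeats the header in every part:
+
+```python
+# Illustrative sketch: split large_dataset.csv into ~200 MB parts on row boundaries.
+CHUNK_BYTES = 200 * 1024 * 1024  # assumed target size per part
+
+def split_csv(path="large_dataset.csv", prefix="part_"):
+    with open(path, "rb") as src:
+        header = src.readline()          # keep the CSV header for every part
+        out, written, part = None, 0, 0
+        for line in src:
+            # Start a new part file when none is open or the current one is full.
+            if out is None or written >= CHUNK_BYTES:
+                if out:
+                    out.close()
+                out = open(f"{prefix}{part:02d}.csv", "wb")
+                out.write(header)
+                written = len(header)
+                part += 1
+            out.write(line)
+            written += len(line)
+        if out:
+            out.close()
+
+if __name__ == "__main__":
+    split_csv()
+```
+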
+**Step 3: Data Anonymization**
+A table was created and the sensitive columns (first_name, last_name, address) were anonymized using the SHA-256 hashing algorithm.
+**SQL script attached - anonymization.sql**
+The anonymized data was then exported to a final CSV file named **anonymized_data.csv**.
+
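+Snowflake's `SHA2(value, 256)` replaces each value with its SHA-256 hex digest. As an illustrative spot check (not part of the submitted scripts; the file and column names follow the generated CSV), the same digest can be computed locally in Python and compared against the exported rows:
+
+```python
+import csv
+import hashlib
+
+def sha256_hex(value: str) -> str:
+    # Hex digest of the UTF-8 encoded string, comparable to SHA2(value, 256).
+    return hashlib.sha256(value.encode("utf-8")).hexdigest()
+
+# Hash the sensitive fields of the first original row for comparison
+# with the corresponding row in the anonymized export.
+with open("large_dataset.csv", newline="", encoding="utf-8") as f:
+    row = next(csv.DictReader(f))
+    print({col: sha256_hex(row[col]) for col in ("first_name", "last_name", "address")})
+```
+
+Because the hash is deterministic, identical input values always map to identical digests, so duplicate names remain linkable in the anonymized output.
+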
+Google Drive link to the anonymized output: https://drive.google.com/drive/folders/1tnv5quKsPEqH7pZsuEiPP-go8kLkg8sO?usp=sharing
+
diff --git a/data-processing/anonymization.sql b/data-processing/anonymization.sql
new file mode 100644
index 00000000..25ad35b9
--- /dev/null
+++ b/data-processing/anonymization.sql
@@ -0,0 +1,45 @@
+--creating a table called user_data
+CREATE OR REPLACE TABLE user_data (
+    first_name STRING,
+    last_name STRING,
+    address STRING,
+    date_of_birth DATE
+);
+
+--internal stage used while loading the split CSV parts
+CREATE OR REPLACE STAGE my_stage;
+
+--sanity check after loading
+SELECT COUNT(*) FROM user_data;
+
+--anonymization: replace the sensitive columns with their SHA-256 digests
+UPDATE user_data
+SET
+    first_name = SHA2(first_name, 256),
+    last_name = SHA2(last_name, 256),
+    address = SHA2(address, 256);
+
+--peek at the staged files (positional columns)
+SELECT $1, $2, $3, $4 FROM @my_stage LIMIT 10;
+
+--copying the anonymized table into a csv file
+COPY INTO @~/processed_anonymized_data.csv
+FROM user_data
+FILE_FORMAT = (TYPE = 'CSV', FIELD_OPTIONALLY_ENCLOSED_BY = '"');
+
+LIST @~;
+
+--keep a copy of the anonymized data
+CREATE OR REPLACE TABLE temp_data AS
+SELECT * FROM user_data;
+
+LIST @~;
+
+--export everything as a single file (up to 5 GB)
+COPY INTO @~/final_anonymized_data.csv
+FROM temp_data
+FILE_FORMAT = (TYPE = 'CSV', FIELD_OPTIONALLY_ENCLOSED_BY = '"')
+SINGLE = TRUE
+MAX_FILE_SIZE = 5368709120; -- 5 GB
+
diff --git a/data-processing/samplefakedatagenerator.py b/data-processing/samplefakedatagenerator.py
new file mode 100644
index 00000000..1a45f15b
--- /dev/null
+++ b/data-processing/samplefakedatagenerator.py
@@ -0,0 +1,52 @@
+import csv
+from faker import Faker
+import time
+
+fake = Faker()
+fake.seed_instance(42)
+
+INITIAL_ROWS = 10_000_000      # First batch of data
+ADDITIONAL_ROWS = 15_000_000   # Second batch to reach ~25 million total
+BATCH_SIZE = 100_000
+FILE_NAME = 'large_dataset.csv'
+
+HEADERS = ['first_name', 'last_name', 'address', 'date_of_birth']
+
+start_time = time.time()
+
+def generate_data(num_rows, mode='w'):
+    data_buffer = []
+    with open(FILE_NAME, mode=mode, newline='', encoding='utf-8') as file:
+        writer = csv.writer(file)
+
+        if mode == 'w':
+            writer.writerow(HEADERS)
+
+        for i in range(1, num_rows + 1):
+            data_buffer.append([
+                fake.first_name(),
+                fake.last_name(),
+                fake.address().replace("\n", ", "),
+                fake.date_of_birth(minimum_age=18, maximum_age=90)
+            ])
+
+            if i % BATCH_SIZE == 0:
+                writer.writerows(data_buffer)
+                data_buffer = []
+                print(f"Added {i:,} rows...")
+
+        if data_buffer:
+            writer.writerows(data_buffer)
+
+# First batch: 10 million rows
+print("Generating initial dataset (10 million rows)...")
+generate_data(INITIAL_ROWS, mode='w')
+
+# Second batch: append 15 million more rows
+fake.seed_instance(None)
+print("Appending additional dataset (15 million rows)...")
+generate_data(ADDITIONAL_ROWS, mode='a')
+
+end_time = time.time()
+total_rows = INITIAL_ROWS + ADDITIONAL_ROWS
+print(f"Generated {total_rows:,} rows in {end_time - start_time:.2f} seconds!")
diff --git a/fixed-width-parser/Dockerfile b/fixed-width-parser/Dockerfile
new file mode 100644
index 00000000..fbd3d697
--- /dev/null
+++ b/fixed-width-parser/Dockerfile
@@ -0,0 +1,4 @@
+FROM python:3.9-slim
+WORKDIR /app
+COPY . .
+CMD ["python", "parser.py"]
diff --git a/fixed-width-parser/README.md b/fixed-width-parser/README.md
new file mode 100644
index 00000000..0cee5de0
--- /dev/null
+++ b/fixed-width-parser/README.md
@@ -0,0 +1,19 @@
+**Problem:** Parse a fixed-width file
+
+Generate a fixed-width file using the provided spec (the offsets given in the spec file represent the length of each field).
+
+Implement a parser that can parse the fixed-width file and generate a delimited file, such as CSV.
+
+Instructions given and followed:
+
+- DO NOT use python libraries like pandas for parsing. You can use the standard library to write out a csv file (If you feel like)
+- Language choices (Python or Scala)
+- Deliver source via github or bitbucket
+- Bonus points if you deliver a docker container (Dockerfile) that can be used to run the code (too lazy to install stuff that you might use)
+- Pay attention to encoding
+
+A **Dockerfile** is delivered; it runs `parser.py`, which converts the generated `data.fixed` into `data.csv`. The fixed-width file itself is produced by `generator.py` from `spec.json`.
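+
+The core of the parser (see `parser.py` for the full version): the field widths from `spec.json` are turned into cumulative (start, end) positions, and each record is split by plain string slicing, with no third-party libraries. A condensed sketch of that approach:
+
+```python
+import json
+
+with open("spec.json", encoding="utf-8") as f:
+    spec = json.load(f)
+
+# Cumulative (start, end) positions derived from the field widths.
+bounds, start = [], 0
+for width in map(int, spec["Offsets"]):
+    bounds.append((start, start + width))
+    start += width
+
+with open("data.fixed", encoding=spec.get("FixedWidthEncoding", "utf-8")) as fixed:
+    if spec.get("IncludeHeader", "False").lower() == "true":
+        next(fixed)  # skip the header row; column names come from the spec
+    for line in fixed:
+        print([line[s:e].strip() for s, e in bounds])
+```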
diff --git a/fixed-width-parser/data.csv b/fixed-width-parser/data.csv
new file mode 100644
index 00000000..dd8e526f
--- /dev/null
+++ b/fixed-width-parser/data.csv
@@ -0,0 +1,5 @@
+f1,f2,f3,f4,f5,f6,f7,f8,f9,f10
+1001,Michael Jord,MJ,J,michael.jorda,USA,3125557890,Chicago,Basketball Legend,Retired
+1002,Emma Watson,EW,W,emma.watson@h,UK,2075554321,London,Actress & Activist,Celebrity
+1003,Elon Musk,EM,M,elon.musk@tes,USA,6505551234,Austin,Tech Entrepreneur,CEO
+1004,Olivia Brown,OB,B,olivia.brown@,CAN,4165556789,Toronto,,Teacher
diff --git a/fixed-width-parser/data.fixed b/fixed-width-parser/data.fixed
new file mode 100644
index 00000000..842e38df
--- /dev/null
+++ b/fixed-width-parser/data.fixed
@@ -0,0 +1,5 @@
+f1   f2          f3 f4f5           f6     f7        f8           f9                  f10
+1001 Michael JordMJ J michael.jordaUSA    3125557890Chicago      Basketball Legend   Retired
+1002 Emma Watson EW W emma.watson@hUK     2075554321London       Actress & Activist  Celebrity
+1003 Elon Musk   EM M elon.musk@tesUSA    6505551234Austin       Tech Entrepreneur   CEO
+1004 Olivia BrownOB B olivia.brown@CAN    4165556789Toronto                          Teacher
diff --git a/fixed-width-parser/generator.py b/fixed-width-parser/generator.py
new file mode 100644
index 00000000..2c6ec7c7
--- /dev/null
+++ b/fixed-width-parser/generator.py
@@ -0,0 +1,40 @@
+# Comments are included throughout for reference/understanding
+
+import json
+
+# Let's start with loading the spec from our json file
+with open('spec.json', 'r') as f:
+    spec = json.load(f)
+
+# Now, let's extract the column names, offsets, and encoding details from the json file
+columns = spec['ColumnNames']                       # Lists out the column names such as f1, f2, f3, etc.
+offsets = list(map(int, spec['Offsets']))           # Convert strings to integers
+encoding = spec.get('FixedWidthEncoding', 'utf-8')  # Encoding for the fixed-width file (windows-1252 per the spec)
+
+# Generating some sample data
+data = [
+    {"f1": "1001", "f2": "Michael Jordan", "f3": "MJ", "f4": "J", "f5": "michael.jordan@nba.com", "f6": "USA", "f7": "3125557890", "f8": "Chicago", "f9": "Basketball Legend", "f10": "Retired"},
+    {"f1": "1002", "f2": "Emma Watson", "f3": "EW", "f4": "W", "f5": "emma.watson@hollywood.org", "f6": "UK", "f7": "2075554321", "f8": "London", "f9": "Actress & Activist", "f10": "Celebrity"},
+    {"f1": "1003", "f2": "Elon Musk", "f3": "EM", "f4": "M", "f5": "elon.musk@tesla.com", "f6": "USA", "f7": "6505551234", "f8": "Austin", "f9": "Tech Entrepreneur", "f10": "CEO"},
+    {"f1": "1004", "f2": "Olivia Brown", "f3": "OB", "f4": "B", "f5": "olivia.brown@edu.edu", "f6": "CAN", "f7": "4165556789", "f8": "Toronto", "f9": "", "f10": "Teacher"}
+]
+
+# A helper that makes each value match the required width, truncating or padding as needed
+def pad_or_truncate(text, width):
+    return str(text)[:width].ljust(width)
+
+# Now let's generate the fixed-width file
+with open('data.fixed', 'w', encoding=encoding) as f:
+    if spec.get('IncludeHeader', 'False').lower() == 'true':  # include a header row if the spec asks for one
+        header = ''.join(pad_or_truncate(col, width)
+                         for col, width in zip(columns, offsets))
+        f.write(header + '\n')  # newline after the header
+
+    # Next comes writing the actual data
+    for row in data:
+        line = ''.join(pad_or_truncate(row.get(col, ''), width) for col, width in zip(columns, offsets))
+        f.write(line + '\n')  # newline after each row
+
+# Print the result
+print("Fixed-width file named 'data.fixed' has been generated")
diff --git a/fixed-width-parser/parser.py b/fixed-width-parser/parser.py
new file mode 100644
index 00000000..0ef047a7
--- /dev/null
+++ b/fixed-width-parser/parser.py
@@ -0,0 +1,40 @@
+import json
+import csv
+
+# Load the specifications from our json file
+with open('spec.json', 'r') as f:
+    spec = json.load(f)
+
+# Extract the details
+columns = spec['ColumnNames']                             # Column names
+offsets = list(map(int, spec['Offsets']))                 # Convert strings to integers
+fixed_encoding = spec.get('FixedWidthEncoding', 'utf-8')  # Encoding of the fixed-width input file
+csv_encoding = spec.get('DelimitedEncoding', 'utf-8')     # Encoding of the output CSV file
+
+# Now, time for calculating the beginning and end positions for each field
+indices = []
+begin = 0
+for width in offsets:
+    end = begin + width
+    indices.append((begin, end))  # Save the begin and end positions
+    begin = end                   # Move to the next starting point
+
+# Parse the fixed-width file and write the rows to a CSV file
+with open('data.fixed', 'r', encoding=fixed_encoding) as infile, \
+     open('data.csv', 'w', newline='', encoding=csv_encoding) as outfile:
+
+    writer = csv.writer(outfile)  # Create a CSV writer object
+    writer.writerow(columns)      # Write the header row in the CSV
+
+    # Skip the header row of the fixed-width file if it has one
+    if spec.get('IncludeHeader', 'False').lower() == 'true':
+        infile.readline()
+
+    # Now, let's go through each line of the file and pull out the data we need
+    for line in infile:
+        # Extract each field from the line using its position range
+        row = [line[start:end].strip() for (start, end) in indices]
+        writer.writerow(row)  # Write the extracted row to our CSV file
+
+# Print the result
+print("Fixed-width file has been parsed into 'data.csv'")
diff --git a/fixed-width-parser/spec.json b/fixed-width-parser/spec.json
new file mode 100644
index 00000000..bd8312ac
--- /dev/null
+++ b/fixed-width-parser/spec.json
@@ -0,0 +1,29 @@
+{
+    "ColumnNames": [
+        "f1",
+        "f2",
+        "f3",
+        "f4",
+        "f5",
+        "f6",
+        "f7",
+        "f8",
+        "f9",
+        "f10"
+    ],
+    "Offsets": [
+        "5",
+        "12",
+        "3",
+        "2",
+        "13",
+        "7",
+        "10",
+        "13",
+        "20",
+        "13"
+    ],
+    "FixedWidthEncoding": "windows-1252",
+    "IncludeHeader": "True",
+    "DelimitedEncoding": "utf-8"
+}