Skip to content

Commit b0f7abd

Browse files
committed
23rd June 2025
1 parent 123f140 commit b0f7abd

File tree

2 files changed

+73
-0
lines changed

2 files changed

+73
-0
lines changed

Data Cleaning.ipynb

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"id": "initial_id",
7+
"metadata": {
8+
"collapsed": true
9+
},
10+
"outputs": [],
11+
"source": [
12+
""
13+
]
14+
}
15+
],
16+
"metadata": {
17+
"kernelspec": {
18+
"display_name": "Python 3",
19+
"language": "python",
20+
"name": "python3"
21+
},
22+
"language_info": {
23+
"codemirror_mode": {
24+
"name": "ipython",
25+
"version": 2
26+
},
27+
"file_extension": ".py",
28+
"mimetype": "text/x-python",
29+
"name": "python",
30+
"nbconvert_exporter": "python",
31+
"pygments_lexer": "ipython2",
32+
"version": "2.7.6"
33+
}
34+
},
35+
"nbformat": 4,
36+
"nbformat_minor": 5
37+
}

Data maker.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
"""Generate a synthetic factory-worker dataset and save it to CSV.

Builds `num_rows` rows and `num_columns` columns: four base columns
(WorkerID, Age, Experience, Department) plus simulated sensor readings
correlated with experience, with ~5% missing values injected per sensor
column for data-cleaning practice.
"""
import random

import numpy as np
import pandas as pd

# Settings
num_rows = 1000     # Factory workers (rows)
num_columns = 12    # Total data columns: 4 base + 8 sensor columns

# Base columns
worker_ids = [f"W{1000 + i}" for i in range(num_rows)]
ages = np.random.randint(20, 60, size=num_rows)  # ages in [20, 60)
# Experience is capped at age - 20, so nobody started working before 20.
experience_years = [random.randint(0, age - 20) for age in ages]
departments = np.random.choice(
    ['Assembly', 'Maintenance', 'Packaging', 'QC', 'Logistics'],
    size=num_rows,
)

# Start the dataframe with the base columns
df = pd.DataFrame({
    'WorkerID': worker_ids,
    'Age': ages,
    'Experience': experience_years,
    'Department': departments,
})

# Generate the remaining sensor columns. range(1, num_columns - 3) yields
# exactly num_columns - 4 columns (Sensor_1 .. Sensor_8), since 4 base
# columns were already added above.
for i in range(1, num_columns - 3):
    col_name = f"Sensor_{i}"
    # Sensor value loosely correlated with experience, plus Gaussian noise.
    df[col_name] = (
        df['Experience'] * np.random.uniform(0.5, 1.5)
        + np.random.normal(0, 5, size=num_rows)
    )

    # Insert NaNs randomly (~5% per sensor column) for cleaning practice.
    mask = np.random.rand(num_rows) < 0.05
    df.loc[mask, col_name] = np.nan

# Save to CSV
df.to_csv("factory_worker_data.csv", index=False)

print(f"CSV with {num_rows} rows and {df.shape[1]} columns generated successfully.")

0 commit comments

Comments
 (0)