|
"""Generate a synthetic factory-worker dataset and save it as a CSV file.

The output has one row per worker and ``num_columns`` columns in total:
four base columns (WorkerID, Age, Experience, Department) followed by
``Sensor_<i>`` columns derived from Experience plus Gaussian noise, with
roughly 5% of each sensor column blanked out for data-cleaning practice.
"""
import random

import numpy as np
import pandas as pd

# Settings
num_rows = 1000  # Factory workers (one row each)
num_columns = 12  # Total columns in the output, base columns included

# Base columns (Experience is reused below to derive the sensor columns)
worker_ids = [f"W{1000 + i}" for i in range(num_rows)]
ages = np.random.randint(20, 60, size=num_rows)  # upper bound exclusive: 20..59
# Experience is drawn from 0..(age - 20), so it never exceeds the years
# elapsed since age 20. int() hands plain Python ints to the stdlib RNG.
experience_years = [random.randint(0, int(age) - 20) for age in ages]
departments = np.random.choice(
    ['Assembly', 'Maintenance', 'Packaging', 'QC', 'Logistics'],
    size=num_rows,
)

# Start the dataframe
df = pd.DataFrame({
    'WorkerID': worker_ids,
    'Age': ages,
    'Experience': experience_years,
    'Department': departments,
})

# Fill the remaining columns with sensor data. The count is derived from the
# base frame rather than hard-coded, and clamped so that a num_columns
# smaller than the base column count simply yields no sensor columns.
num_sensor_columns = max(num_columns - len(df.columns), 0)

experience = df['Experience'].to_numpy()
sensor_columns = {}
for i in range(1, num_sensor_columns + 1):
    # One random scale factor per column, plus per-row Gaussian noise.
    scale = np.random.uniform(0.5, 1.5)
    values = experience * scale + np.random.normal(0, 5, size=num_rows)

    # Insert NaNs randomly for data cleaning practice
    mask = np.random.rand(num_rows) < 0.05  # ~5% missing
    values[mask] = np.nan

    sensor_columns[f"Sensor_{i}"] = values

# Attach all sensor columns in one step; inserting them one by one fragments
# the frame and becomes slow (with a PerformanceWarning) for large counts.
if sensor_columns:
    df = pd.concat(
        [df, pd.DataFrame(sensor_columns, index=df.index)],
        axis=1,
    )

# Save to CSV
df.to_csv("factory_worker_data.csv", index=False)

# Report the real shape instead of a hard-coded (and incorrect) column count.
print(
    f"CSV with {df.shape[0]:,} rows and {df.shape[1]:,} columns "
    "generated successfully."
)
0 commit comments