acm-uic · ParthPrajapati7 · Oct 22, 2025 · Oct 22, 2025 · Oct 22, 2025 · Oct 22, 2025
diff --git a/data/dataPreprocess.py b/data/dataPreprocess.py
@@ -0,0 +1,81 @@
+# Clean the raw USA housing data and derive model-ready feature columns.
+#
+# Reads "raw/USA Housing Dataset.csv" and writes
+# "clean/(Clean) USA Housing Dataset.csv"; both paths are relative to the
+# current working directory.
+import os
+
+import numpy as np
+import pandas as pd
+
+
+def _safe_ratio(numerator, denominator):
+    """Return numerator / denominator, with NaN wherever the denominator is 0."""
+    return numerator / denominator.replace(0, np.nan)
+
+
+# Loading the data
+df = pd.read_csv("raw/USA Housing Dataset.csv")
+
+# Deletion
+# Removing the columns we don't need
+df = df.drop(columns=["street", "country"])
+
+# Adjustment
+# Unparseable dates become NaT (errors="coerce") instead of raising, so the
+# date-derived columns below are NaN for those rows.
+df["date"] = pd.to_datetime(df["date"], errors="coerce")
+# NOTE: price is kept on its raw scale; apply np.log1p here for a log target.
+
+# Calculation
+# Creating new calculated columns
+
+# Time related
+# The times are split into numbers because these are easier to train on.
+# Year is not included because the dataset is limited to 2014.
+sale_year = df["date"].dt.year
+df["Day of Week"] = df["date"].dt.dayofweek  # Monday=0 ... Sunday=6
+# Calendar quarter of the sale: Jan-Mar -> 1, ..., Oct-Dec -> 4
+df["Season Sold"] = (df["date"].dt.month - 1) // 3 + 1
+
+# Ages, in years at the time of sale
+# NOTE(review): assumes yr_renovated == 0 marks a house that was never
+# renovated -- confirm against the raw data.
+was_renovated = df["yr_renovated"] > 0
+df["House Age"] = sale_year - df["yr_built"]
+# Never-renovated houses fall back to the house age; subtracting the 0
+# placeholder would otherwise report an "age" of roughly 2014 years.
+df["Renovation Age"] = np.where(
+    was_renovated, sale_year - df["yr_renovated"], df["House Age"]
+)
+# A 1/0 flag, not another copy of the renovation age
+df["Is Renovated"] = was_renovated.astype(int)
+
+# Ratios (NaN when the denominator is 0)
+df["Lot-Living Ratio"] = _safe_ratio(df["sqft_lot"], df["sqft_living"])
+df["Basement Ratio"] = _safe_ratio(df["sqft_basement"], df["sqft_living"])
+df["Areas Per Bedroom"] = _safe_ratio(df["sqft_living"], df["bedrooms"])
+df["Bathrooms Per Bedroom"] = _safe_ratio(df["bathrooms"], df["bedrooms"])
+df["Bedrooms Per Floor"] = _safe_ratio(df["bedrooms"], df["floors"])
+
+# Interactions
+df["Beds x Baths"] = df["bedrooms"] * df["bathrooms"]
+df["Sqft Living x Waterfront"] = df["sqft_living"] * df["waterfront"]
+
+# Location
+# statezip is split on whitespace into its state and ZIP code parts
+df[["State", "ZIP Code"]] = df["statezip"].str.split(expand=True)
+
+# Deletion Pt 2
+# Sometimes it's better to have the calculated column over the raw data, so we
+# drop those here. Since the data is all from the same year, the years built
+# and renovated aren't necessary: the model would just derive the age from them.
+df = df.drop(columns=["statezip", "yr_built", "yr_renovated"])
+
+# Get Dataframe as a CSV
+output_dir = "clean"
+output_path = os.path.join(output_dir, "(Clean) USA Housing Dataset.csv")
+os.makedirs(output_dir, exist_ok=True)
+df.to_csv(output_path, index=False)
+
+print("✅ Data preprocessing complete – cleaned file saved to", output_path)
diff --git a/data/modelTraining.py b/data/modelTraining.py
@@ -0,0 +1,114 @@
+# Train a baseline SGD linear regressor for house prices on the cleaned data.
+#
+# Reads "clean/(Clean) USA Housing Dataset.csv" and writes the fitted model,
+# its feature scaler and the metrics under "models/"; all paths are relative
+# to the current working directory.
+import json
+import os
+
+import numpy as np
+import pandas as pd
+from joblib import dump
+from sklearn.linear_model import SGDRegressor
+from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+
+TARGET = "price"
+
+df = pd.read_csv("clean/(Clean) USA Housing Dataset.csv")
+
+# The Independent and Dependent Variable
+# Only numeric columns are used as features. The target and the "id" column
+# (when the file has one) are excluded: leaving price in X would hand the model
+# the answer it is supposed to predict.
+features = [
+    col
+    for col in df.select_dtypes(include="number").columns
+    if col not in (TARGET, "id")
+]
+# The estimator rejects missing values, so rows with a NaN in any used column
+# are dropped before splitting.
+df = df.dropna(subset=[*features, TARGET])
+X = df[features].astype(np.float64)
+y = df[TARGET].astype(np.float64)
+
+# TRAINING + TESTING THE MODEL
+# 70% train, then the remaining 30% is halved into validation and test.
+X_train, X_temp, y_train, y_temp = train_test_split(
+    X, y, test_size=0.3, random_state=42
+)
+X_val, X_test, y_val, y_test = train_test_split(
+    X_temp, y_temp, test_size=0.5, random_state=42
+)
+
+# SGD is sensitive to feature scale, so standardise with statistics taken from
+# the training split only.
+scaler = StandardScaler()
+X_train = scaler.fit_transform(X_train)
+X_val = scaler.transform(X_val)
+X_test = scaler.transform(X_test)
+
+# Model Type
+# Each partial_fit() call below performs a single epoch, so the epoch count is
+# set by the training loop rather than by max_iter / warm_start.
+model = SGDRegressor(
+    loss="squared_error",  # named "squared_loss" before scikit-learn 1.0
+    penalty=None,
+    learning_rate="constant",
+    eta0=0.001,
+    random_state=42,
+)
+
+# Training Loop & Metrics
+n_epochs = 200
+train_mae, val_mae = [], []
+train_rmse, val_rmse = [], []
+train_r2, val_r2 = [], []
+best_val_mae = float("inf")
+
+for epoch in range(n_epochs):
+    model.partial_fit(X_train, y_train)
+
+    # Predictions
+    y_pred_train = model.predict(X_train)
+    y_pred_val = model.predict(X_val)
+
+    # Metrics Calculation
+    train_mae.append(mean_absolute_error(y_train, y_pred_train))
+    val_mae.append(mean_absolute_error(y_val, y_pred_val))
+    train_rmse.append(np.sqrt(mean_squared_error(y_train, y_pred_train)))
+    val_rmse.append(np.sqrt(mean_squared_error(y_val, y_pred_val)))
+    train_r2.append(r2_score(y_train, y_pred_train))
+    val_r2.append(r2_score(y_val, y_pred_val))
+
+    # Stop as soon as validation MAE is worse than the best seen so far.
+    if val_mae[-1] > best_val_mae:
+        print(f"Early stopping after epoch {epoch+1}")
+        break
+    best_val_mae = val_mae[-1]
+
+# Final Evaluation on Test Set
+y_pred_test = model.predict(X_test)
+test_mae = mean_absolute_error(y_test, y_pred_test)
+test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
+test_r2 = r2_score(y_test, y_pred_test)
+
+# Saving the model, its scaler and the metrics
+# NOTE(review): the scaler and metrics file names are chosen here -- adjust
+# them if other code expects different paths.
+model_dir = "models"
+os.makedirs(model_dir, exist_ok=True)
+dump(model, os.path.join(model_dir, "baseline.joblib"))
+dump(scaler, os.path.join(model_dir, "baseline_scaler.joblib"))
+
+baseline_metrics = {
+    "features": features,
+    "epochs_run": len(val_mae),
+    "test_mae": float(test_mae),
+    "test_rmse": float(test_rmse),
+    "test_r2": float(test_r2),
+}
+metrics_path = os.path.join(model_dir, "baseline_metrics.json")
+with open(metrics_path, "w", encoding="utf-8") as f:
+    json.dump(baseline_metrics, f, indent=4)