Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 71 additions & 0 deletions data/dataPreprocess.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# Importing necessary libraries
import os

import numpy as np
import pandas as pd

# Loading the data
# The path is relative, so it is resolved against the current working directory.
raw_csv_path = "raw/USA Housing Dataset.csv"
df = pd.read_csv(raw_csv_path)

# Deletion
# Discard the columns that the rest of the script never uses
unused_columns = ["street", "country"]
df = df.drop(columns=unused_columns)


# Adjustment
# Turn the sale date into a real datetime column so the .dt accessors work below.
# errors="coerce" means an unparseable value becomes NaT instead of raising.
parsed_dates = pd.to_datetime(df["date"], errors="coerce")
df["date"] = parsed_dates
# Optional target transform, currently disabled:
# df["price"] = np.log1p(df["price"])

# Calculation
# Creating new calculated columns

# Time related
# The sale date is split into plain numbers because these are easier to train on.
# The year is not included because the dataset is limited to 2014.
sale_date = df["date"].dt
df["Day of Week"] = sale_date.dayofweek  # pandas convention: 0 = Monday ... 6 = Sunday
df["Season Sold"] = (sale_date.month - 1) // 3 + 1  # months 1-3 -> 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4

# Subtraction
sale_year = sale_date.year
df["House Age"] = sale_year - df["yr_built"]

# NOTE(review): assumes a non-positive yr_renovated means "never renovated" - confirm against the raw data.
was_renovated = df["yr_renovated"] > 0

# Years since the last renovation. For homes that were never renovated the plain
# subtraction would yield roughly the sale year itself (e.g. 2014 - 0), so the
# house age is used instead: years since the structure was last (re)built.
df["Renovation Age"] = np.where(
    was_renovated, sale_year - df["yr_renovated"], df["House Age"]
)

# 1 when the home has been renovated, 0 otherwise.
# (Previously this column repeated the "Renovation Age" formula instead of holding a flag.)
df["Is Renovated"] = was_renovated.astype(int)

# Ratios
# Each entry is (new column, numerator column, denominator column).
# Rows whose denominator is 0 receive NaN instead of a division-by-zero result.
ratio_specs = [
    ("Lot-Living Ratio", "sqft_lot", "sqft_living"),
    ("Basement Ratio", "sqft_basement", "sqft_living"),
    ("Areas Per Bedroom", "sqft_living", "bedrooms"),
    ("Bathrooms Per Bedroom", "bathrooms", "bedrooms"),
    ("Bedrooms Per Floor", "bedrooms", "floors"),
]
for ratio_name, numerator_col, denominator_col in ratio_specs:
    denominator = df[denominator_col]
    quotient = df[numerator_col] / denominator
    df[ratio_name] = np.where(denominator != 0, quotient, np.nan)

# Interactions
# Element-wise products of two existing columns
bedroom_count = df["bedrooms"]
df["Beds x Baths"] = bedroom_count * df["bathrooms"]

living_area = df["sqft_living"]
df["Sqft Living x Waterfront"] = living_area * df["waterfront"]

# Break "statezip" apart on whitespace; the pieces become the two new columns
statezip_parts = df["statezip"].str.split(expand=True)
df[["State", "ZIP Code"]] = statezip_parts

# Deletion Pt 2
# The derived columns replace these raw ones. Because every sale is from the same
# year, the build/renovation years add nothing beyond the ages computed from them.
superseded_columns = ["statezip", "yr_built", "yr_renovated"]
df = df.drop(columns=superseded_columns)

# Get Dataframe as a CSV
output_dir = "clean"
clean_csv_path = os.path.join(output_dir, "(Clean) USA Housing Dataset.csv")
os.makedirs(output_dir, exist_ok=True)  # create the folder on first run
df.to_csv(clean_csv_path, index=False)

print("✅ Data preprocessing complete – cleaned file saved to", clean_csv_path)
65 changes: 65 additions & 0 deletions data/modelTraining.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Baseline model training
# Fits a plain linear regressor with SGD on the cleaned housing data, records
# train/validation metrics after every epoch, evaluates once on the test split,
# and saves the fitted model together with its metrics.
import json
import os

import numpy as np
import pandas as pd
from joblib import dump  # installed as a dependency of scikit-learn
from sklearn.impute import SimpleImputer
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler

df = pd.read_csv("clean/(Clean) USA Housing Dataset.csv")

# The Independent and Dependent Variable
# The target is popped out of the frame BEFORE the features are selected, so the
# price can never end up inside X.
y = df.pop("price").astype(np.float32)

# Only numeric columns can be cast to float, so text columns are left out of this
# baseline. "id" is dropped when it exists (an identifier is not a feature);
# errors="ignore" keeps the script working when the column is absent.
X = (
    df.drop(columns=["id"], errors="ignore")
    .select_dtypes(include="number")
    .astype(np.float64)
)

# TRAINING + TESTING THE MODEL
# 70 % train, then the remaining 30 % is halved into validation and test.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# SGD rejects NaN inputs and is unstable on features with very different scales,
# so missing values are imputed and every column is standardised. Both steps are
# fitted on the training split only to keep validation/test data unseen.
preprocess = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())
X_train_ready = preprocess.fit_transform(X_train)
X_val_ready = preprocess.transform(X_val)
X_test_ready = preprocess.transform(X_test)

# Model Type
# max_iter / warm_start are omitted: they only affect fit(), while this script
# drives the epochs itself through partial_fit().
model = SGDRegressor(
    loss="squared_error",  # current name of the former "squared_loss"
    penalty=None,
    learning_rate="constant",
    eta0=0.001,
    random_state=42,
)

# Training Loop & Metrics
n_epochs = 200
train_mae, val_mae = [], []
train_rmse, val_rmse = [], []
train_r2, val_r2 = [], []

for epoch in range(n_epochs):
    # One call to partial_fit performs a single pass over the training data.
    model.partial_fit(X_train_ready, y_train)

    # Predictions
    y_pred_train = model.predict(X_train_ready)
    y_pred_val = model.predict(X_val_ready)

    # Metrics Calculation
    train_mae.append(mean_absolute_error(y_train, y_pred_train))
    val_mae.append(mean_absolute_error(y_val, y_pred_val))

    train_rmse.append(np.sqrt(mean_squared_error(y_train, y_pred_train)))
    val_rmse.append(np.sqrt(mean_squared_error(y_val, y_pred_val)))

    train_r2.append(r2_score(y_train, y_pred_train))
    val_r2.append(r2_score(y_val, y_pred_val))

    # Stop as soon as the validation MAE is worse than the best value seen so far.
    if epoch > 0 and val_mae[-1] > min(val_mae[:-1]):
        print(f"Early stopping after epoch {epoch+1}")
        break

# Final Evaluation on Test Set
y_pred_test = model.predict(X_test_ready)
test_mae = mean_absolute_error(y_test, y_pred_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
test_r2 = r2_score(y_test, y_pred_test)

# float() guarantees plain Python numbers, which json can always serialise.
baseline_metrics = {
    "epochs_run": len(val_mae),
    "train": {
        "mae": float(train_mae[-1]),
        "rmse": float(train_rmse[-1]),
        "r2": float(train_r2[-1]),
    },
    "validation": {
        "mae": float(val_mae[-1]),
        "rmse": float(val_rmse[-1]),
        "r2": float(val_r2[-1]),
    },
    "test": {
        "mae": float(test_mae),
        "rmse": float(test_rmse),
        "r2": float(test_r2),
    },
}

# Bundle the fitted preprocessing with the regressor so the saved artifact can
# predict directly from the same numeric columns used for training.
baseline = Pipeline(preprocess.steps + [("sgd", model)])

os.makedirs("models", exist_ok=True)  # dump() does not create missing folders
dump(baseline, "models/baseline.joblib")

# NOTE(review): the metrics file name is chosen here because the original script
# never opened the file it wrote to - rename if another location is expected.
with open("models/baseline_metrics.json", "w", encoding="utf-8") as f:
    json.dump(baseline_metrics, f, indent=4)