|
import logging
from typing import Optional, Tuple, Union

import pandas as pd
from sklearn.model_selection import train_test_split

from project_config import ProjectConfig
from utils import outlier_imputer, rush_hourizer
| 7 | + |
# Set up logging when the module is imported: INFO and above, each record
# rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Module-scoped logger, named after this module.
logger = logging.getLogger(__name__)
| 12 | + |
| 13 | + |
class DataProcessor:
    """Clean raw taxi-trip records and derive the columns used for training."""

    def __init__(self, df: pd.DataFrame, config: ProjectConfig):
        """
        Class to process taxi data for model training

        Args:
            df (pd.DataFrame): Raw trip records. ``process_data`` reads the
                columns ``lpep_pickup_datetime``, ``lpep_dropoff_datetime``,
                ``fare_amount``, ``trip_distance``, ``PULocationID`` and
                ``DOLocationID``, and renames ``VendorID`` to ``vendor_id``.
            config (ProjectConfig): ProjectConfig object. Its ``num_features``
                and ``target`` attributes select the columns that are kept.
        """
        self.df = df
        self.config = config

    def process_data(self):
        """
        Process raw data

        Replaces ``self.df`` with a cleaned frame restricted to
        ``config.num_features + config.target``, with rows containing missing
        values removed. The DataFrame originally passed to the constructor is
        not modified: all work is done on a de-duplicated copy.

        Steps, in order: drop duplicate rows, parse the pickup/dropoff
        timestamps, derive ``duration`` in whole minutes, clip negative fares
        and durations to 0, run ``outlier_imputer`` on both, add per-route
        mean distance and duration, add ``day``/``month``/``rush_hour``,
        rename ``VendorID``, select the configured columns and drop NaNs.
        """
        # Work on a private copy so the caller's DataFrame is never mutated.
        # The explicit copy() also detaches the frame from the filtered result
        # of drop_duplicates(), so the column assignments below do not raise
        # SettingWithCopyWarning.
        df = self.df.drop_duplicates().copy()

        for column in ("lpep_pickup_datetime", "lpep_dropoff_datetime"):
            df[column] = pd.to_datetime(df[column])

        # Trip length in whole minutes (floor division of the elapsed seconds).
        elapsed = df["lpep_dropoff_datetime"] - df["lpep_pickup_datetime"]
        df["duration"] = elapsed.dt.total_seconds() // 60

        # Negative fares / durations are treated as zero.
        df.loc[df["fare_amount"] < 0, "fare_amount"] = 0
        df.loc[df["duration"] < 0, "duration"] = 0

        # NOTE(review): outlier_imputer lives in utils and is not visible
        # here; presumably it caps high outliers using an IQR factor of 6 --
        # confirm against its definition.
        df = outlier_imputer(df, ["fare_amount"], 6)
        df = outlier_imputer(df, ["duration"], 6)

        # Route key: "<pickup location id> <dropoff location id>".
        df["pickup_dropoff"] = (
            df["PULocationID"].astype(str) + " " + df["DOLocationID"].astype(str)
        )

        # Per-route means broadcast back onto every row of that route. A single
        # groupby over just the two needed columns replaces two full-frame
        # aggregations followed by dict lookups.
        route_means = df.groupby("pickup_dropoff")[
            ["trip_distance", "duration"]
        ].transform("mean")
        df["mean_distance"] = route_means["trip_distance"]
        df["mean_duration"] = route_means["duration"]

        pickup = df["lpep_pickup_datetime"]
        df["day"] = pickup.dt.day_name().str.lower()
        df["month"] = pickup.dt.strftime("%b").str.lower()

        # rush_hour starts out as the pickup hour; weekend rows are forced to 0
        # and weekday rows are then replaced by rush_hourizer's result.
        weekend = df["day"].isin(["saturday", "sunday"])
        df["rush_hour"] = pickup.dt.hour
        df.loc[weekend, "rush_hour"] = 0
        df["rush_hour"] = df["rush_hour"].astype(int)

        weekday = ~weekend
        # Skip the row-wise apply when no weekday rows exist: apply() on an
        # empty selection is not guaranteed to return a Series that can be
        # assigned to a single column.
        if weekday.any():
            # NOTE(review): rush_hourizer comes from utils; presumably it maps
            # the hour stored in "rush_hour" to a 0/1 flag -- confirm.
            df.loc[weekday, "rush_hour"] = df.loc[weekday].apply(
                rush_hourizer, axis=1
            )

        df = df.rename(columns={"VendorID": "vendor_id"})

        # Assumes num_features and target are both lists of column names, so
        # that "+" concatenates them -- TODO confirm against ProjectConfig.
        relevant_cols = self.config.num_features + self.config.target

        self.df = df.loc[:, relevant_cols].dropna()

    def split_data(
        self, test_size: Optional[float] = 0.2, random_state: Optional[int] = 42
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Split data into train and test sets

        Args:
            test_size (float, optional): Size of test set. Defaults to 0.2.
            random_state (int, optional): Random state. Defaults to 42.

        Returns:
            Tuple[pd.DataFrame, pd.DataFrame]: ``(train_set, test_set)``, both
            produced from ``self.df`` by ``train_test_split``.
        """
        train_set, test_set = train_test_split(
            self.df, test_size=test_size, random_state=random_state
        )
        return train_set, test_set
0 commit comments