Skip to content

Commit 30da1c0

Browse files
committed
add data processor class
1 parent a0fc632 commit 30da1c0

File tree

1 file changed

+99
-0
lines changed

1 file changed

+99
-0
lines changed

src/make_data/data_processor.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
import logging
from typing import Optional, Tuple, Union

import pandas as pd
from sklearn.model_selection import train_test_split

from project_config import ProjectConfig
from utils import outlier_imputer, rush_hourizer
7+
8+
# Configure the root logger when this module is imported: records at INFO and
# above are emitted, each prefixed with its timestamp and level name.
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)

# Logger named after this module.
logger = logging.getLogger(__name__)
12+
13+
14+
class DataProcessor:
    """Clean raw taxi trip records and split them into train/test sets.

    The processor keeps its own copy of the input frame in ``self.df``.
    ``process_data`` rewrites that copy into the columns named by
    ``config.num_features + config.target``; ``split_data`` then partitions
    whatever ``self.df`` currently holds.
    """

    def __init__(self, df: pd.DataFrame, config: "ProjectConfig"):
        """
        Class to process taxi data for model training

        Args:
            df (pd.DataFrame): Raw trip records. A copy is stored, so the
                caller's DataFrame is never modified by this class.
            config (ProjectConfig): ProjectConfig object; ``process_data``
                reads its ``num_features`` and ``target`` attributes.
        """
        # process_data() drops rows and overwrites columns in place before it
        # rebinds self.df; without a copy the caller's frame would be left
        # half-processed.
        self.df = df.copy()
        self.config = config

    def process_data(self):
        """Process raw data

        Steps, in order:
            1. Drop duplicate rows and parse the pickup/dropoff timestamps.
            2. Derive ``duration`` in whole minutes and clamp negative
               ``fare_amount`` / ``duration`` values to 0.
            3. Pass ``fare_amount`` and ``duration`` through
               ``outlier_imputer``.
            4. Add ``mean_distance`` and ``mean_duration``, the per-route
               (pickup/dropoff location pair) averages.
            5. Add ``day``, ``month`` and ``rush_hour`` from the pickup time.
            6. Rename ``VendorID`` to ``vendor_id``, keep only the configured
               columns and drop rows with missing values.

        The result replaces ``self.df``; nothing is returned.
        """
        self.df.drop_duplicates(inplace=True)

        for column in ("lpep_pickup_datetime", "lpep_dropoff_datetime"):
            self.df[column] = pd.to_datetime(self.df[column])

        # Trip length in whole minutes: floor division discards the seconds.
        elapsed = self.df["lpep_dropoff_datetime"] - self.df["lpep_pickup_datetime"]
        self.df["duration"] = elapsed.dt.total_seconds() // 60

        # Negative fares and durations are replaced with 0.
        self.df.loc[self.df["fare_amount"] < 0, "fare_amount"] = 0
        self.df.loc[self.df["duration"] < 0, "duration"] = 0

        # NOTE(review): outlier_imputer is defined in utils; the meaning of
        # the third argument (6) is not visible here - confirm against utils.
        self.df = outlier_imputer(self.df, ["fare_amount"], 6)
        self.df = outlier_imputer(self.df, ["duration"], 6)

        # A route is identified by its pickup and dropoff location ids.
        self.df["pickup_dropoff"] = (
            self.df["PULocationID"].astype(str)
            + " "
            + self.df["DOLocationID"].astype(str)
        )

        # transform("mean") broadcasts each route's mean back onto its rows,
        # touching only the two columns that are needed.
        by_route = self.df.groupby("pickup_dropoff")
        mean_distance = by_route["trip_distance"].transform("mean")
        mean_duration = by_route["duration"].transform("mean")
        self.df["mean_distance"] = mean_distance
        self.df["mean_duration"] = mean_duration

        pickup_time = self.df["lpep_pickup_datetime"]
        self.df["day"] = pickup_time.dt.day_name().str.lower()
        self.df["month"] = pickup_time.dt.strftime("%b").str.lower()

        # rush_hour starts as the pickup hour; weekend rows are forced to 0
        # and weekday rows are then replaced by rush_hourizer's result.
        self.df["rush_hour"] = pickup_time.dt.hour
        is_weekend = self.df["day"].isin(["saturday", "sunday"])
        self.df.loc[is_weekend, "rush_hour"] = 0
        self.df["rush_hour"] = self.df["rush_hour"].astype(int)

        is_weekday = ~is_weekend
        # Skip the row-wise apply when there are no weekday rows: there is
        # nothing to compute, and apply() on an empty selection does not
        # produce a per-row result to assign.
        if is_weekday.any():
            self.df.loc[is_weekday, "rush_hour"] = self.df.loc[is_weekday].apply(
                rush_hourizer, axis=1
            )

        self.df.rename(columns={"VendorID": "vendor_id"}, inplace=True)

        # NOTE(review): assumes config.num_features and config.target are
        # both lists of column names - confirm against ProjectConfig.
        relevant_cols = self.config.num_features + self.config.target

        self.df = self.df.loc[:, relevant_cols]
        self.df.dropna(inplace=True)

    def split_data(
        self, test_size: Optional[float] = 0.2, random_state: Optional[int] = 42
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Split data into train and test sets

        Args:
            test_size (float, optional): Size of test set, forwarded to
                ``train_test_split``. Defaults to 0.2.
            random_state (int, optional): Random state, forwarded to
                ``train_test_split``. Defaults to 42.

        Returns:
            Tuple[pd.DataFrame, pd.DataFrame]: Train and test pandas
            dataframes, in that order
        """
        train_set, test_set = train_test_split(
            self.df, test_size=test_size, random_state=random_state
        )
        return train_set, test_set

0 commit comments

Comments
 (0)