-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocessing.py
More file actions
55 lines (43 loc) · 2.57 KB
/
preprocessing.py
File metadata and controls
55 lines (43 loc) · 2.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import pandas as pd
from sklearn.preprocessing import StandardScaler
class Preprocessor:
def __init__(self, df, problem_type):
self.df = df
self.problem_type = problem_type.lower()
def preprocess_features(self):
print("[INFO] Preprocessing başladı...")
# Eksik veri kontrolü ve doldurma
if self.df.isnull().sum().sum() > 0:
print("[WARNING] Eksik veri bulundu! Eksik değerler ortalama ile dolduruluyor.")
self.df = self.df.fillna(self.df.mean(numeric_only=True))
# Problem 2 (ürün satış performansı analizi - örnek problem olabilir)
if self.problem_type == 'problem_2':
feature_columns = ['avg_unit_price', 'total_sales_count', 'avg_quantity_per_order', 'unique_customer_count']
id_column = 'product_id'
# Problem 3: Tedarikçi Segmentasyonu
elif self.problem_type == 'problem_3':
self.df['supplied_product_count'] = self.df.groupby('supplier_id')['product_id'].transform('nunique')
self.df['total_quantity_sold'] = self.df.groupby('supplier_id')['quantity'].transform('sum')
self.df['average_unit_price'] = self.df.groupby('supplier_id')['unit_price'].transform('mean')
self.df['order_count'] = self.df.groupby('supplier_id')['order_id'].transform('nunique')
feature_columns = ['supplied_product_count', 'total_quantity_sold', 'average_unit_price', 'order_count']
id_column = 'supplier_id'
# Problem 4: Ülkelere Göre Satış Deseni Analizi
elif self.problem_type == 'problem_4':
# Sorgudan gelen sütun isimlerini Python tarafında uyarlıyoruz
self.df.rename(columns={
'avg_order_value': 'avg_order_amount',
'avg_items_per_order': 'avg_products_per_order'
}, inplace=True)
feature_columns = ['total_orders', 'avg_order_amount', 'avg_products_per_order']
id_column = 'country'
else:
raise ValueError(f"[ERROR] Geçersiz problem türü: {self.problem_type}")
# Seçilen özellikleri al ve ölçekle
feature_df = self.df[feature_columns]
scaler = StandardScaler()
feature_df_scaled = pd.DataFrame(scaler.fit_transform(feature_df), columns=feature_columns)
# ID sütununu da ekle
result_df = pd.concat([self.df[[id_column]].reset_index(drop=True), feature_df_scaled], axis=1)
print(f"[INFO] Preprocessing tamamlandı. Problem: {self.problem_type}")
return result_df