-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtrain.py
More file actions
118 lines (97 loc) · 3.78 KB
/
train.py
File metadata and controls
118 lines (97 loc) · 3.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import logging
import os

import joblib
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, f1_score

# Import custom modules
from src.preprocessing import TextCleaner
from src.features import MetaFeatureExtractor
# Setup Logging
# Configures the root logger so the module-level logging.info/warning calls
# below emit timestamped, level-tagged lines to stderr.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def load_data(filepath):
    """Load the training dataset from ``filepath``.

    Args:
        filepath: Path to a CSV file (e.g. the Kaggle ``train.csv``).

    Returns:
        pandas.DataFrame with the file's contents.

    Raises:
        FileNotFoundError: If no file exists at ``filepath``.
    """
    # Fail early with an actionable message instead of letting pandas raise
    # a less specific error. NOTE: requires the module-level `import os`;
    # the original only imported os inside main(), so calling this function
    # on its own raised NameError.
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"Data file not found at {filepath}. Please download train.csv from Kaggle.")
    return pd.read_csv(filepath)
def build_advanced_pipeline():
    """Assemble the two-branch training pipeline.

    The 'text' column is fed to two parallel branches:
      * 'nlp'  — TextCleaner followed by TF-IDF vectorisation (3000 features)
      * 'meta' — MetaFeatureExtractor over the raw, uncleaned text
    Their outputs are concatenated by a ColumnTransformer and passed to a
    GradientBoostingClassifier.

    Returns:
        sklearn.pipeline.Pipeline ready for fit / GridSearchCV.
    """
    # NLP branch: clean, then vectorise.
    text_branch = Pipeline([
        ('cleaner', TextCleaner()),
        ('tfidf', TfidfVectorizer(max_features=3000)),
    ])

    # Stats branch: counting URLs/hashtags needs the raw text, so no cleaner.
    stats_branch = Pipeline([('stats', MetaFeatureExtractor())])

    # Both branches read the same 'text' column; the transformer names
    # ('nlp', 'meta') are part of the grid-search parameter namespace.
    combined = ColumnTransformer(transformers=[
        ('nlp', text_branch, 'text'),
        ('meta', stats_branch, 'text'),
    ])

    return Pipeline([
        ('preprocessor', combined),
        ('clf', GradientBoostingClassifier(random_state=42)),
    ])
def main():
    """End-to-end training entry point.

    Loads (or synthesizes) the dataset, tunes the two-branch pipeline with
    GridSearchCV, prints a held-out classification report, and persists the
    refit best estimator to ``dris_model_v1.pkl``.
    """
    # 1. Load Data
    # If the real Kaggle CSV is absent, generate a small labelled dummy set
    # so the script DOES NOT CRASH and stays runnable end to end.
    data_path = 'data/train.csv'
    if not os.path.exists(data_path):
        logging.warning("Real dataset not found. Generating dummy complex data...")
        os.makedirs('data', exist_ok=True)
        dummy_data = {
            'text': [
                'Fire in the building! Help!',
                'This movie is straight fire lol',
                'Forest fires spreading in California http://news.com',
                'I am so tired today',
                'Earthquake detected: Magnitude 5.3 #danger',
                'My world was shaken by her beauty'
            ] * 20,  # Multiply to simulate size
            'target': [1, 0, 1, 0, 1, 0] * 20
        }
        pd.DataFrame(dummy_data).to_csv(data_path, index=False)

    # Reuse the module's own loader instead of duplicating pd.read_csv here.
    df = load_data(data_path)

    # 2. Split Data (hold out 20%; stratify keeps the class balance in both splits)
    logging.info("Dataset loaded. Shape: %s", df.shape)
    X = df[['text']]  # DataFrame (not Series) so ColumnTransformer can select by column name
    y = df['target']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # 3. Initialize Pipeline
    pipeline = build_advanced_pipeline()

    # 4. GridSearch (tunes the NLP branch and the classifier jointly)
    param_grid = {
        'preprocessor__nlp__tfidf__ngram_range': [(1, 1), (1, 2)],  # Unigrams vs Bigrams
        'clf__n_estimators': [50, 100],
        'clf__learning_rate': [0.1]
    }
    logging.info("Starting GridSearch optimization...")
    search = GridSearchCV(pipeline, param_grid, cv=3, scoring='f1', n_jobs=-1)
    search.fit(X_train, y_train)

    # 5. Evaluation on the held-out test split
    logging.info("Best Params: %s", search.best_params_)
    y_pred = search.predict(X_test)
    print("\n--- CLASSIFICATION REPORT ---")
    print(classification_report(y_test, y_pred))

    # 6. Save Model (best_estimator_ is refit on the full training split and
    # includes the preprocessing branches, so it is self-contained at load time)
    joblib.dump(search.best_estimator_, 'dris_model_v1.pkl')
    logging.info("Model saved successfully.")


if __name__ == "__main__":
    main()