-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtrain.py
More file actions
118 lines (97 loc) · 3.78 KB
/
train.py
File metadata and controls
118 lines (97 loc) · 3.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import logging
import os

import joblib
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, f1_score

# Import custom modules
from src.preprocessing import TextCleaner
from src.features import MetaFeatureExtractor
# Setup Logging
# Configures the root logger so the module-level logging.info/warning calls
# below emit timestamped, level-tagged lines to stderr.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def load_data(filepath):
    """Load the training dataset from ``filepath``.

    Args:
        filepath: Path to a CSV file (e.g. the Kaggle ``train.csv``).

    Returns:
        pandas.DataFrame with the file's contents.

    Raises:
        FileNotFoundError: If no file exists at ``filepath``.
    """
    # Fail early with an actionable message instead of letting pandas raise
    # a less specific error. NOTE: requires the module-level `import os`;
    # the original only imported os inside main(), so calling this function
    # on its own raised NameError.
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"Data file not found at {filepath}. Please download train.csv from Kaggle.")
    return pd.read_csv(filepath)
def build_advanced_pipeline():
    """Assemble the two-branch training pipeline.

    The 'text' column is fed to two parallel branches:
      * 'nlp'  — TextCleaner followed by TF-IDF vectorisation (3000 features)
      * 'meta' — MetaFeatureExtractor over the raw, uncleaned text
    Their outputs are concatenated by a ColumnTransformer and passed to a
    GradientBoostingClassifier.

    Returns:
        sklearn.pipeline.Pipeline ready for fit / GridSearchCV.
    """
    # NLP branch: clean, then vectorise.
    text_branch = Pipeline([
        ('cleaner', TextCleaner()),
        ('tfidf', TfidfVectorizer(max_features=3000)),
    ])

    # Stats branch: counting URLs/hashtags needs the raw text, so no cleaner.
    stats_branch = Pipeline([('stats', MetaFeatureExtractor())])

    # Both branches read the same 'text' column; the transformer names
    # ('nlp', 'meta') are part of the grid-search parameter namespace.
    combined = ColumnTransformer(transformers=[
        ('nlp', text_branch, 'text'),
        ('meta', stats_branch, 'text'),
    ])

    return Pipeline([
        ('preprocessor', combined),
        ('clf', GradientBoostingClassifier(random_state=42)),
    ])
def main():
    """End-to-end training entry point.

    Loads (or synthesizes) the dataset, tunes the two-branch pipeline with
    GridSearchCV, prints a held-out classification report, and persists the
    refit best estimator to ``dris_model_v1.pkl``.
    """
    # 1. Load Data
    # If the real Kaggle CSV is absent, generate a small labelled dummy set
    # so the script DOES NOT CRASH and stays runnable end to end.
    data_path = 'data/train.csv'
    if not os.path.exists(data_path):
        logging.warning("Real dataset not found. Generating dummy complex data...")
        os.makedirs('data', exist_ok=True)
        dummy_data = {
            'text': [
                'Fire in the building! Help!',
                'This movie is straight fire lol',
                'Forest fires spreading in California http://news.com',
                'I am so tired today',
                'Earthquake detected: Magnitude 5.3 #danger',
                'My world was shaken by her beauty'
            ] * 20,  # Multiply to simulate size
            'target': [1, 0, 1, 0, 1, 0] * 20
        }
        pd.DataFrame(dummy_data).to_csv(data_path, index=False)

    # Reuse the module's own loader instead of duplicating pd.read_csv here.
    df = load_data(data_path)

    # 2. Split Data (hold out 20%; stratify keeps the class balance in both splits)
    logging.info("Dataset loaded. Shape: %s", df.shape)
    X = df[['text']]  # DataFrame (not Series) so ColumnTransformer can select by column name
    y = df['target']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # 3. Initialize Pipeline
    pipeline = build_advanced_pipeline()

    # 4. GridSearch (tunes the NLP branch and the classifier jointly)
    param_grid = {
        'preprocessor__nlp__tfidf__ngram_range': [(1, 1), (1, 2)],  # Unigrams vs Bigrams
        'clf__n_estimators': [50, 100],
        'clf__learning_rate': [0.1]
    }
    logging.info("Starting GridSearch optimization...")
    search = GridSearchCV(pipeline, param_grid, cv=3, scoring='f1', n_jobs=-1)
    search.fit(X_train, y_train)

    # 5. Evaluation on the held-out test split
    logging.info("Best Params: %s", search.best_params_)
    y_pred = search.predict(X_test)
    print("\n--- CLASSIFICATION REPORT ---")
    print(classification_report(y_test, y_pred))

    # 6. Save Model (best_estimator_ is refit on the full training split and
    # includes the preprocessing branches, so it is self-contained at load time)
    joblib.dump(search.best_estimator_, 'dris_model_v1.pkl')
    logging.info("Model saved successfully.")


if __name__ == "__main__":
    main()