diff --git a/Diabetes Prediction [END 2 END]/diabetes_pipeline/.gitignore b/Diabetes Prediction [END 2 END]/diabetes_pipeline/.gitignore
new file mode 100644
index 0000000..4ec23b6
--- /dev/null
+++ b/Diabetes Prediction [END 2 END]/diabetes_pipeline/.gitignore
@@ -0,0 +1,4 @@
+__pycache__/
+*.pyc
+model/*.pkl
+.venv/
diff --git a/Diabetes Prediction [END 2 END]/diabetes_pipeline/README.md b/Diabetes Prediction [END 2 END]/diabetes_pipeline/README.md
new file mode 100644
index 0000000..c43b6d4
--- /dev/null
+++ b/Diabetes Prediction [END 2 END]/diabetes_pipeline/README.md
@@ -0,0 +1,154 @@
+# Diabetes Prediction – Machine Learning Pipeline
+
+> ⚠️ This repository is a **forked project**.
+> The work below represents my **independent contribution and extension** to the original codebase.
+
+This project implements a complete **end-to-end machine learning pipeline** for predicting diabetes using the Pima Indians Diabetes dataset.
+The pipeline covers **data preprocessing, model training, evaluation, experimentation, and inference via CLI**.
+
+---
+
+## 📁 Project Structure
+```text
+diabetes_pipeline/
+│
+├── dataset/
+│   └── kaggle_diabetes.csv
+│
+├── model/
+│   ├── diabetes_model.pkl
+│   └── scaler.pkl
+│
+├── experiments/
+│   └── experiment_runner.py
+│
+├── data_preprocessing.py
+├── train.py
+├── predict.py
+├── evaluate.py
+└── README.md
+```
+
+---
+
+## 🚀 My Contributions
+
+I independently designed and implemented the following components:
+
+### 1. Data Preprocessing Pipeline
+- Handled missing values in medical features:
+  - `Glucose`, `BloodPressure`, `SkinThickness`, `Insulin`, `BMI`
+- Replaced invalid zeros with `NaN`
+- Applied **mean / median imputation**
+- Standardized features using `StandardScaler`
+- Ensured consistent feature names across training and inference
+
+📄 `data_preprocessing.py`
+
+---
+
+### 2. 
Model Training
+- Implemented a reproducible training pipeline
+- Trained and persisted:
+  - Random Forest classifier
+  - Feature scaler
+- Stored trained artifacts for reuse and deployment
+
+📄 `train.py`
+
+---
+
+### 3. Model Evaluation
+- Added evaluation logic with:
+  - Accuracy
+  - Precision, Recall, F1-score
+- Verified generalization on the test set
+
+📄 `evaluate.py`
+
+---
+
+### 4. Experimentation Framework
+- Benchmarked multiple ML models:
+  - Logistic Regression
+  - Decision Tree
+  - Random Forest
+  - Support Vector Machine (SVM)
+- Automatically reports accuracy and F1-score
+
+📄 `experiments/experiment_runner.py`
+
+#### Sample Results
+
+| Model | Accuracy | F1 Score |
+|----------------------|----------|----------|
+| Logistic Regression | 0.7875 | 0.6320 |
+| Decision Tree | 0.9875 | 0.9805 |
+| Random Forest | 0.9950 | 0.9921 |
+| SVM | 0.8450 | 0.7328 |
+
+✔️ **Random Forest performs best on this dataset**
+
+---
+
+### 5. Command-Line Prediction Interface
+- Built a CLI-based inference script
+- Ensures:
+  - Correct feature order
+  - Feature-name alignment with trained scaler
+- Predicts diabetes for a single patient input
+
+📄 `predict.py`
+
+Example:
+```bash
+python predict.py \
+  --pregnancies 2 \
+  --glucose 120 \
+  --bp 70 \
+  --skin 20 \
+  --insulin 80 \
+  --bmi 25 \
+  --dpf 0.5 \
+  --age 35
+```
+
+
+---
+
+## 🛠️ Tech Stack
+
+- Python 3.10+
+- pandas
+- numpy
+- scikit-learn
+- joblib
+
+---
+
+## 🧩 Notes
+
+- Project is modular and deployment-ready
+- Structured to support FastAPI / Flask integration
+- Generated files cleaned using `.gitignore`
+- Suitable for internship-level ML engineering evaluation
+
+---
+
+## 👩‍💻 Author Contribution
+
+**Contributor:** Tandrita Mukherjee
+
+**Contribution Scope:**
+- ML pipeline design
+- Data preprocessing
+- Model training & evaluation
+- Experimentation framework
+- CLI-based inference system
+
+---
+
+## 📌 Disclaimer
+
+This repository is a fork of an existing project.
+All enhancements, restructuring, and ML pipeline components listed above were implemented independently as part of my learning and internship preparation.
diff --git a/Diabetes Prediction [END 2 END]/diabetes_pipeline/config.py b/Diabetes Prediction [END 2 END]/diabetes_pipeline/config.py
new file mode 100644
index 0000000..47431de
--- /dev/null
+++ b/Diabetes Prediction [END 2 END]/diabetes_pipeline/config.py
@@ -0,0 +1,7 @@
+from pathlib import Path
+
+BASE_DIR = Path(__file__).resolve().parent
+
+MODEL_DIR = BASE_DIR / "model"
+MODEL_PATH = MODEL_DIR / "diabetes_model.pkl"
+SCALER_PATH = MODEL_DIR / "scaler.pkl"
diff --git a/Diabetes Prediction [END 2 END]/diabetes_pipeline/data_preprocessing.py b/Diabetes Prediction [END 2 END]/diabetes_pipeline/data_preprocessing.py
new file mode 100644
index 0000000..4c5f805
--- /dev/null
+++ b/Diabetes Prediction [END 2 END]/diabetes_pipeline/data_preprocessing.py
@@ -0,0 +1,60 @@
+# diabetes_pipeline/data_preprocessing.py
+
+import pandas as pd
+import numpy as np
+from pathlib import Path
+from sklearn.model_selection import train_test_split
+
+# Columns in which a zero stands for a missing reading, mapped to the
+# statistic used to fill the gap.
+IMPUTATION_STRATEGY = {
+    'Glucose': 'mean',
+    'BloodPressure': 'mean',
+    'SkinThickness': 'median',
+    'Insulin': 'median',
+    'BMI': 'median',
+}
+
+
+def load_and_preprocess(test_size=0.2, random_state=0, csv_path=None):
+    """Load the diabetes CSV, impute missing readings and split it.
+
+    Zeros in the ``IMPUTATION_STRATEGY`` columns are treated as missing.
+    Fill values are computed on the training split only and then applied
+    to both splits, so no test-set statistics leak into training.
+
+    Args:
+        test_size: Fraction of rows held out for testing.
+        random_state: Seed forwarded to ``train_test_split``.
+        csv_path: Optional CSV location; defaults to
+            ``dataset/kaggle_diabetes.csv`` next to this module.
+
+    Returns:
+        ``(X_train, X_test, y_train, y_test)``.
+    """
+    if csv_path is None:
+        base_dir = Path(__file__).resolve().parent
+        csv_path = base_dir / "dataset" / "kaggle_diabetes.csv"
+    df = pd.read_csv(csv_path)
+
+    df = df.rename(columns={'DiabetesPedigreeFunction': 'DPF'})
+
+    cols_with_zero = list(IMPUTATION_STRATEGY)
+    df[cols_with_zero] = df[cols_with_zero].replace(0, np.nan)
+
+    X = df.drop(columns='Outcome')
+    y = df['Outcome']
+
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=test_size, random_state=random_state
+    )
+
+    # Learn the fill values from the training rows only, then reuse them.
+    fill_values = {
+        col: X_train[col].agg(stat)
+        for col, stat in IMPUTATION_STRATEGY.items()
+    }
+    X_train = X_train.fillna(fill_values)
+    X_test = X_test.fillna(fill_values)
+
+    return X_train, X_test, y_train, y_test
diff --git 
a/Diabetes Prediction [END 2 END]/dataset/kaggle_diabetes.csv b/Diabetes Prediction [END 2 END]/diabetes_pipeline/dataset/kaggle_diabetes.csv
similarity index 100%
rename from Diabetes Prediction [END 2 END]/dataset/kaggle_diabetes.csv
rename to Diabetes Prediction [END 2 END]/diabetes_pipeline/dataset/kaggle_diabetes.csv
diff --git a/Diabetes Prediction [END 2 END]/diabetes_pipeline/evaluate.py b/Diabetes Prediction [END 2 END]/diabetes_pipeline/evaluate.py
new file mode 100644
index 0000000..e8e8bd8
--- /dev/null
+++ b/Diabetes Prediction [END 2 END]/diabetes_pipeline/evaluate.py
@@ -0,0 +1,29 @@
+"""Evaluate the persisted diabetes model on the held-out test split."""
+
+import joblib
+from sklearn.metrics import accuracy_score, classification_report
+from data_preprocessing import load_and_preprocess
+from config import MODEL_PATH, SCALER_PATH
+
+
+def main():
+    """Print accuracy and a classification report for the saved model."""
+    # load_and_preprocess returns exactly four values; the scaler is a
+    # separate artifact written by train.py.
+    _, X_test, _, y_test = load_and_preprocess()
+
+    model = joblib.load(MODEL_PATH)
+    scaler = joblib.load(SCALER_PATH)
+
+    # Scale the features the same way predict.py does before predicting.
+    y_pred = model.predict(scaler.transform(X_test))
+
+    accuracy = accuracy_score(y_test, y_pred)
+    report = classification_report(y_test, y_pred)
+
+    print("Accuracy:", accuracy)
+    print("\nClassification Report:\n", report)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/Diabetes Prediction [END 2 END]/diabetes_pipeline/experiments/__init__.py b/Diabetes Prediction [END 2 END]/diabetes_pipeline/experiments/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/Diabetes Prediction [END 2 END]/diabetes_pipeline/experiments/experiment_runner.py b/Diabetes Prediction [END 2 END]/diabetes_pipeline/experiments/experiment_runner.py
new file mode 100644
index 0000000..afcf9a2
--- /dev/null
+++ b/Diabetes Prediction [END 2 END]/diabetes_pipeline/experiments/experiment_runner.py
@@ -0,0 +1,57 @@
+# diabetes_pipeline/experiments/experiment_runner.py
+"""Benchmark several classifiers on the diabetes dataset."""
+
+from pathlib import Path
+
+import pandas as pd
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import StandardScaler
+from sklearn.linear_model import LogisticRegression
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.svm import SVC
+from sklearn.metrics import accuracy_score, f1_score
+
+from diabetes_pipeline.data_preprocessing import load_and_preprocess
+
+# Written next to this script so the location does not depend on the CWD.
+RESULTS_PATH = Path(__file__).resolve().parent / "results.csv"
+
+
+def build_models():
+    """Return the candidate estimators keyed by display name."""
+    return {
+        "LogisticRegression": LogisticRegression(max_iter=1000),
+        "DecisionTree": DecisionTreeClassifier(random_state=0),
+        "RandomForest": RandomForestClassifier(n_estimators=50, random_state=0),
+        "SVM": SVC(),
+    }
+
+
+def run_experiments():
+    """Fit every candidate in a scaling pipeline and collect test metrics."""
+    X_train, X_test, y_train, y_test = load_and_preprocess()
+
+    results = []
+    for name, model in build_models().items():
+        # The scaler lives inside the pipeline, so it is fit on X_train only.
+        pipeline = Pipeline([
+            ("scaler", StandardScaler()),
+            ("model", model),
+        ])
+        pipeline.fit(X_train, y_train)
+        preds = pipeline.predict(X_test)
+
+        results.append({
+            "Model": name,
+            "Accuracy": accuracy_score(y_test, preds),
+            "F1 Score": f1_score(y_test, preds),
+        })
+
+    return pd.DataFrame(results)
+
+
+if __name__ == "__main__":
+    df = run_experiments()
+    print(df)
+    df.to_csv(RESULTS_PATH, index=False)
diff --git a/Diabetes Prediction [END 2 END]/diabetes_pipeline/experiments/results.csv b/Diabetes Prediction [END 2 END]/diabetes_pipeline/experiments/results.csv
new file mode 100644
index 0000000..6c2e471
--- /dev/null
+++ b/Diabetes Prediction [END 2 END]/diabetes_pipeline/experiments/results.csv
@@ -0,0 +1,5 @@
+Model,Accuracy,F1 Score
+LogisticRegression,0.7875,0.6320346320346321
+DecisionTree,0.9875,0.980544747081712
+RandomForest,0.995,0.9921259842519685
+SVM,0.845,0.7327586206896551
diff --git a/Diabetes Prediction [END 2 END]/diabetes_pipeline/logs/training.log b/Diabetes Prediction [END 2 END]/diabetes_pipeline/logs/training.log
new file mode 100644
index 0000000..53025b1
--- /dev/null
+++ b/Diabetes Prediction [END 2 END]/diabetes_pipeline/logs/training.log
@@ -0,0 +1,4 @@
+2025-12-28 11:48:56,518 - INFO - Training started
+2025-12-28 11:48:56,641 - INFO - Model and scaler saved successfully
+2025-12-28 11:49:14,730 - INFO - Training started
+2025-12-28 11:49:14,821 - INFO - Model and scaler saved successfully
diff --git a/Diabetes Prediction [END 2 
END]/diabetes_pipeline/predict.py b/Diabetes Prediction [END 2 END]/diabetes_pipeline/predict.py
new file mode 100644
index 0000000..b83a64e
--- /dev/null
+++ b/Diabetes Prediction [END 2 END]/diabetes_pipeline/predict.py
@@ -0,0 +1,56 @@
+"""Command-line diabetes prediction for a single patient."""
+
+import argparse
+import joblib
+import pandas as pd
+
+from config import MODEL_PATH, SCALER_PATH
+
+
+def parse_args(argv=None):
+    """Parse the eight patient measurements from the command line."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--pregnancies", type=int, required=True)
+    parser.add_argument("--glucose", type=float, required=True)
+    parser.add_argument("--bp", type=float, required=True)
+    parser.add_argument("--skin", type=float, required=True)
+    parser.add_argument("--insulin", type=float, required=True)
+    parser.add_argument("--bmi", type=float, required=True)
+    parser.add_argument("--dpf", type=float, required=True)
+    parser.add_argument("--age", type=int, required=True)
+    return parser.parse_args(argv)
+
+
+def build_features(args):
+    """Build a one-row frame whose columns match the training features."""
+    # IMPORTANT: feature names must match training
+    return pd.DataFrame([{
+        "Pregnancies": args.pregnancies,
+        "Glucose": args.glucose,
+        "BloodPressure": args.bp,
+        "SkinThickness": args.skin,
+        "Insulin": args.insulin,
+        "BMI": args.bmi,
+        "DPF": args.dpf,
+        "Age": args.age
+    }])
+
+
+def main(argv=None):
+    """Load the saved artifacts, score one patient and print the verdict."""
+    args = parse_args(argv)
+
+    model = joblib.load(MODEL_PATH)
+    scaler = joblib.load(SCALER_PATH)
+
+    input_scaled = scaler.transform(build_features(args))
+    prediction = model.predict(input_scaled)[0]
+
+    if prediction == 1:
+        print("⚠️ Diabetes detected")
+    else:
+        print("✅ No diabetes detected")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/Diabetes Prediction [END 2 END]/diabetes_pipeline/train.py b/Diabetes Prediction [END 2 END]/diabetes_pipeline/train.py
new file mode 100644
index 0000000..c84e53c
--- /dev/null
+++ b/Diabetes Prediction [END 2 END]/diabetes_pipeline/train.py
@@ -0,0 +1,43 @@
+"""Train the diabetes classifier and persist it together with its scaler."""
+
+import logging
+import joblib
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.preprocessing import StandardScaler
+from data_preprocessing import load_and_preprocess
+from config import BASE_DIR, MODEL_PATH, SCALER_PATH, MODEL_DIR
+
+LOG_PATH = BASE_DIR / "logs" / "training.log"
+
+
+def main():
+    """Fit the scaler and the random forest, then save both artifacts."""
+    # Anchor the log file to the package directory and create the folder,
+    # so training does not depend on the current working directory.
+    LOG_PATH.parent.mkdir(parents=True, exist_ok=True)
+    logging.basicConfig(
+        filename=LOG_PATH,
+        level=logging.INFO,
+        format="%(asctime)s - %(levelname)s - %(message)s"
+    )
+
+    logging.info("Training started")
+
+    # load_and_preprocess returns four values and no scaler, so the scaler
+    # is fitted here, on the training split only.
+    X_train, _, y_train, _ = load_and_preprocess()
+    scaler = StandardScaler()
+    X_train_scaled = scaler.fit_transform(X_train)
+
+    classifier = RandomForestClassifier(n_estimators=20, random_state=0)
+    classifier.fit(X_train_scaled, y_train)
+
+    MODEL_DIR.mkdir(parents=True, exist_ok=True)
+    joblib.dump(classifier, MODEL_PATH)
+    joblib.dump(scaler, SCALER_PATH)
+
+    logging.info("Model and scaler saved successfully")
+
+
+if __name__ == "__main__":
+    main()