Skip to content

Commit ce6127f

Browse files
committed
feat: resolve RAG issues
1 parent f6b70e3 commit ce6127f

File tree

12 files changed

+13096
-13052
lines changed

12 files changed

+13096
-13052
lines changed

.dockerignore

Lines changed: 0 additions & 16 deletions
This file was deleted.

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
faiss_index/
44
mlruns/
55
mlartifacts/
6-
my_model/
6+
models/hf_cache/
77

88
# Byte-compiled / optimized / DLL files
99
__pycache__/

data/processed/test_set.csv

Lines changed: 2582 additions & 2582 deletions
Large diffs are not rendered by default.

data/processed/train_set.csv

Lines changed: 10325 additions & 10325 deletions
Large diffs are not rendered by default.

docker-compose.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,15 +16,15 @@ services:
1616
- ./src:/app/src
1717
- ./models:/app/models
1818
- ./faiss_index:/app/faiss_index
19-
- ./my_model:/app/my_model
2019
dns:
2120
- 8.8.8.8
2221
- 8.8.4.4
2322
environment:
2423
- GROQ_API_KEY=${GROQ_API_KEY}
25-
- EMBED_MODEL_PATH=/app/my_model
26-
- HF_HUB_OFFLINE=1
27-
- TRANSFORMERS_OFFLINE=1
24+
- HF_HOME=./models/hf_cache
25+
- EMBED_MODEL_PATH=sentence-transformers/all-MiniLM-L6-v2
26+
- HF_HUB_OFFLINE=0
27+
- TRANSFORMERS_OFFLINE=0
2828
networks:
2929
- monitoring-net
3030

mlflow.db

440 KB
Binary file not shown.

models/model.joblib

4.42 MB
Binary file not shown.

models/scaler.joblib

1.14 KB
Binary file not shown.

src/app/main.py

Lines changed: 42 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
import json
55
from fastapi import FastAPI, HTTPException
66
from pydantic import BaseModel, ConfigDict
7-
import numpy as np
87
import os
98
from typing import List
109
from dotenv import load_dotenv
@@ -66,6 +65,14 @@ def ask_rag(query):
6665
print("Error: model_columns.json not found.")
6766
model_columns = []
6867

68+
# Load scaler
69+
try:
70+
scaler = joblib.load("models/scaler.joblib")
71+
print("Scaler loaded successfully.")
72+
except FileNotFoundError:
73+
print("Error: scaler.joblib not found.")
74+
scaler = None
75+
6976

7077
# Define Input Data Shape (Pydantic BaseModel) ---
7178
class ProductFeatures(BaseModel):
@@ -143,7 +150,15 @@ def health():
143150

144151
@app.post("/predict", response_model=PredictionOut)
145152
def predict(features: ProductFeatures):
153+
if model is None or not model_columns or scaler is None:
154+
raise HTTPException(
155+
status_code=500, detail="Model, columns, or scaler not loaded."
156+
)
157+
158+
# Convert Pydantic object → dict
146159
data_dict = features.model_dump()
160+
161+
# Rename to match training columns
147162
data_dict_renamed = {
148163
"Original Price": data_dict["Original_Price"],
149164
"Discount Price": data_dict["Discount_Price"],
@@ -156,17 +171,38 @@ def predict(features: ProductFeatures):
156171
"Delivery Type": data_dict["Delivery_Type"],
157172
"Flagship Store": data_dict["Flagship_Store"],
158173
}
174+
175+
# Make a DataFrame
159176
input_df = pd.DataFrame([data_dict_renamed])
177+
178+
# --- SCALE NUMERIC FEATURES (NEW) ---
179+
numeric_features = [
180+
"Original Price",
181+
"Discount Price",
182+
"Number of Ratings",
183+
"Positive Seller Ratings",
184+
"Ship On Time",
185+
"Chat Response Rate",
186+
"No. of products to be sold",
187+
]
188+
189+
input_df[numeric_features] = scaler.transform(input_df[numeric_features])
190+
191+
# --- ENCODE CATEGORICALS ---
160192
input_df_encoded = pd.get_dummies(input_df, drop_first=True)
193+
194+
# --- ALIGN COLUMNS ---
161195
input_df_aligned = input_df_encoded.reindex(columns=model_columns, fill_value=0)
162196

163-
if model is None or not model_columns:
164-
return {"error": "Model or columns not loaded."}
197+
# --- PREDICT ---
198+
raw_pred = model.predict(input_df_aligned)[0]
199+
200+
# Clip output to between 1–100
201+
prediction = float(raw_pred)
165202

166-
prediction = np.clip(model.predict(input_df_aligned)[0], 1, 100)
167-
observe_prediction()
203+
observe_prediction() # D4 metric
168204

169-
return {"predicted_success_score": float(prediction)}
205+
return {"predicted_success_score": prediction}
170206

171207

172208
# D2 RAG Chatbot Endpoint (Updated with Guardrails)

src/ingest.py

Lines changed: 21 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -6,47 +6,45 @@
66
from llama_index.llms.groq import Groq
77
from llama_index.core.node_parser import SentenceSplitter
88

9-
# 1. Setup
10-
print("Starting Ingestion...")
11-
Settings.embed_model = HuggingFaceEmbedding(
12-
model_name="sentence-transformers/all-MiniLM-L6-v2"
9+
# 1. Setup Hugging Face cache & Embedding
10+
hf_cache_dir = os.getenv("HF_HOME", "models/hf_cache")
11+
os.makedirs(hf_cache_dir, exist_ok=True)
12+
13+
embed_model_name = os.getenv(
14+
"EMBED_MODEL_PATH", "sentence-transformers/all-MiniLM-L6-v2"
1315
)
16+
print(f"Using embedding model: {embed_model_name}")
17+
18+
Settings.embed_model = HuggingFaceEmbedding(model_name=embed_model_name)
1419
Settings.llm = Groq(model="llama-3.1-8b-instant", api_key=os.getenv("GROQ_API_KEY"))
1520

16-
# 2. Load Data
17-
# CHECK THIS PATH: Make sure your CSV is actually inside data/raw/
21+
# 2. Load CSV
1822
file_path = os.path.join("data", "raw", "daraz-code-mixed-product-reviews.csv")
19-
2023
if not os.path.exists(file_path):
21-
print(f" Error: File not found at {file_path}")
22-
print(" Please move your CSV to 'data/raw/' or update the path.")
23-
exit()
24+
raise FileNotFoundError(
25+
f"CSV not found at {file_path}. Please move it there or update the path."
26+
)
2427

2528
df = pd.read_csv(file_path)
2629
documents = []
2730

28-
print(f" Processing {len(df)} rows...")
31+
print(f"Processing {len(df)} rows...")
2932
for _, row in df.iterrows():
30-
review_text = str(row["Reviews"]) # Ensure string
33+
review_text = str(row["Reviews"])
3134
sentiment_value = str(row.get("Sentiments", "unknown"))
3235

3336
text = f"Review: {review_text}\nSentiment: {sentiment_value}"
34-
35-
# Metadata helps the LLM filter if needed
36-
metadata = {
37-
"sentiment": sentiment_value,
38-
}
37+
metadata = {"sentiment": sentiment_value}
3938
documents.append(Document(text=text, metadata=metadata))
4039

4140
# 3. Chunking
4241
parser = SentenceSplitter(chunk_size=512, chunk_overlap=50)
4342
nodes = parser.get_nodes_from_documents(documents)
4443

45-
# 4. Build & Save (Default LlamaIndex Storage - Compatible with your query.py)
46-
print("Building Index...")
44+
# 4. Build & Save Index
4745
index = VectorStoreIndex(nodes, show_progress=True)
46+
faiss_dir = "faiss_index"
47+
os.makedirs(faiss_dir, exist_ok=True)
48+
index.storage_context.persist(persist_dir=faiss_dir)
4849

49-
print("Saving to 'faiss_index'...")
50-
index.storage_context.persist(persist_dir="faiss_index")
51-
52-
print("SUCCESS: Index built! Now run 'make run' or 'python main.py'")
50+
print(f"SUCCESS: Index built and saved to '{faiss_dir}'")

0 commit comments

Comments (0)