[#700] Update case import process segmentation logic with correct thresholds for each segment

wayangalihpratama · wayangalihpratama · commit 86d225ded4e8 · 2026-01-23T15:35:31.000+08:00
diff --git a/backend/utils/case_import_process_confirmed_segmentation.py b/backend/utils/case_import_process_confirmed_segmentation.py
@@ -5,7 +5,7 @@
 from io import BytesIO
 
 from models.question import Question
-from models.case_commodity import CaseCommodity
+from models.case_commodity import CaseCommodity, CaseCommodityType
 from models.segment import SegmentUpdateBase, SegmentAnswerBase
 from utils.case_import_storage import load_import_file
 from db.crud_case_import import get_case_import
@@ -25,10 +25,7 @@ def resolve_question(
         q = session.get(Question, int(question_id))
         if q:
             return q
-        raise HTTPException(
-            status_code=400,
-            detail=f"Question not found for id {question_id}",
-        )
+        raise HTTPException(400, f"Question not found for id {question_id}")
 
     if public_key:
         q = (
@@ -39,34 +36,29 @@ def resolve_question(
         if q:
             return q
         raise HTTPException(
-            status_code=400,
-            detail=f"Question not found for public_key '{public_key}'",
+            400, f"Question not found for public_key '{public_key}'"
         )
 
     raise HTTPException(
-        status_code=400,
-        detail="Mapping row must contain either id or public_key",
+        400, "Mapping row must contain either id or public_key"
     )
 
 
 # --------------------------------------------------
-# Main processor
+# Main processor
 # --------------------------------------------------
 def process_confirmed_segmentation(
     *,
     payload,
     session,
 ) -> Dict[str, Any]:
 
-    # --------------------------------------------------
-    # 1. Inputs
-    # --------------------------------------------------
     case_id = payload.case_id
     segmentation_variable = payload.segmentation_variable.strip().lower()
     segments = sorted(payload.segments, key=lambda s: s.index)
 
     # --------------------------------------------------
-    # 2. Load import file (bytes)
+    # Load import file
     # --------------------------------------------------
     case_import = get_case_import(session=session, import_id=payload.import_id)
     content = load_import_file(case_import.file_path)
@@ -76,195 +68,147 @@ def process_confirmed_segmentation(
         data_df = pd.read_excel(xls, sheet_name="data")
         mapping_df = pd.read_excel(xls, sheet_name="mapping")
 
-        # normalize column names
         data_df.columns = data_df.columns.str.strip().str.lower()
         mapping_df.columns = mapping_df.columns.str.strip().str.lower()
     except Exception:
-        raise HTTPException(
-            status_code=400,
-            detail="Failed to read import workbook",
-        )
+        raise HTTPException(400, "Failed to read import workbook")
 
     if segmentation_variable not in data_df.columns:
         raise HTTPException(
-            status_code=400,
-            detail=f"Segmentation variable {segmentation_variable} not found",
+            400, f"Segmentation variable '{segmentation_variable}' not found"
         )
 
     # --------------------------------------------------
-    # 3. Validate mapping sheet
+    # Prepare segmentation series
     # --------------------------------------------------
-    if "variable_name" not in mapping_df.columns:
-        raise HTTPException(
-            status_code=400,
-            detail="Mapping sheet must contain 'variable_name'",
-        )
-
-    if not {"id", "public_key"} & set(mapping_df.columns):
-        raise HTTPException(
-            status_code=400,
-            detail="Mapping sheet must contain at least 'id' or 'public_key'",
-        )
-
-    # Only variables explicitly mapped are output drivers
-    output_variables = (
-        mapping_df["variable_name"].dropna().str.lower().tolist()
-    )
-
-    # --------------------------------------------------
-    # 4. Segment assignment (UI AUTHORITATIVE)
-    # --------------------------------------------------
-    series = data_df[segmentation_variable]
-    is_numeric = pd.api.types.is_numeric_dtype(series)
+    seg_series = data_df[segmentation_variable]
+    is_numeric = pd.api.types.is_numeric_dtype(seg_series)
 
     if not is_numeric:
-        series = series.astype(str).str.strip().str.lower()
-        category_map = {
-            str(seg.value).strip().lower(): seg.name for seg in segments
-        }
-
-    def assign_segment(value):
-        if pd.isna(value):
-            return None
-
-        # ---------- NUMERIC ----------
-        if is_numeric:
-            prev = None
-            for seg in segments:
-                bound = float(seg.value)
-                if prev is None and value <= bound:
-                    return seg.name
-                if prev is not None and prev < value <= bound:
-                    return seg.name
-                prev = bound
-            return segments[-1].name  # last open segment
-
-        # ---------- CATEGORICAL ----------
-        return category_map.get(str(value).strip().lower())
-
-    data_df["_segment"] = series.apply(assign_segment)
+        seg_series = seg_series.astype(str).str.strip().str.lower()
 
     # --------------------------------------------------
-    # 5. Aggregate statistics per segment
+    # Resolve Case Commodity Levels
     # --------------------------------------------------
-    aggregated: Dict[str, Dict[str, Dict[str, float]]] = {}
-
-    for var in output_variables:
-        if var not in data_df.columns:
-            continue
-        if not pd.api.types.is_numeric_dtype(data_df[var]):
-            continue
-
-        stats = (
-            data_df.groupby("_segment")[var]
-            .agg(
-                current="median",
-                feasible=lambda x: x.quantile(0.9),
-            )
-            .dropna()
-            .to_dict(orient="index")
-        )
-
-        aggregated[var] = stats
-
-    # --------------------------------------------------
-    # 6. Build Segment + SegmentAnswer payload
-    # --------------------------------------------------
-    segments_payload = []
-
-    # Generate case commodity value
     case_commodities = (
         session.query(CaseCommodity)
         .filter(CaseCommodity.case == case_id)
         .all()
     )
-    case_commodities = [
-        cm.simplify_with_case_commodity_level for cm in case_commodities
-    ]
-    case_commodity_levels = {}
-    case_commodity_breakdowns = {}
+
+    commodity_level_map = {}
     for cc in case_commodities:
-        key = f"{cc['commodity_type']}"
-        case_commodity_levels[key] = cc["id"]
-        case_commodity_breakdowns[key] = cc["breakdown"]
-    # eol case commodity
+        if cc.commodity_type == CaseCommodityType.focus:
+            commodity_level_map["primary"] = cc.id
+        else:
+            commodity_level_map[cc.commodity_type.value] = cc.id
 
-    for seg in segments:
-        seg_name = seg.name
+    # --------------------------------------------------
+    # Process segments with BOUNDARY FILTERING
+    # --------------------------------------------------
+    segment_payloads = []
+
+    for idx, seg in enumerate(segments):
         seg_id = seg.id
+        seg_name = seg.name
+
+        # ---------- APPLY SEGMENT FILTER ----------
+        if is_numeric:
+            lower = float(segments[idx - 1].value) if idx > 0 else None
+            upper = float(seg.value)
+
+            if lower is None:
+                mask = seg_series <= upper
+            else:
+                mask = (seg_series > lower) & (seg_series <= upper)
+        else:
+            mask = seg_series == str(seg.value).strip().lower()
 
-        seg_df = data_df[data_df["_segment"] == seg_name]
+        seg_df = data_df[mask]
         number_of_farmers = int(len(seg_df))
 
-        seg_answers = []
+        answers = []
 
+        # ---------- PER-SEGMENT AGGREGATION ----------
         for _, row in mapping_df.iterrows():
-            var = str(row["variable_name"]).lower()
+            raw_id = row.get("id", None)
+            level = None
+            qid = None
 
-            if var not in aggregated:
+            if not raw_id:
                 continue
-            if seg_name not in aggregated[var]:
+
+            var = str(row["variable_name"]).strip().lower()
+            if var not in seg_df.columns:
                 continue
 
-            questionID = row.get("id")
-            [qLevel, qID] = (
-                questionID.split("-")
-                if questionID and "-" in questionID
-                else [None, None]
-            )
+            if not pd.api.types.is_numeric_dtype(seg_df[var]):
+                continue
+
+            values = seg_df[var].dropna()
+            if values.empty:
+                continue
+
+            current_value = float(values.median())
+            feasible_value = float(values.quantile(0.9))
+
+            if raw_id and "-" in str(raw_id):
+                level, qid = raw_id.split("-", 1)
+                level = level.lower()
+            else:
+                qid = raw_id
+                # Default commodity level when mapping id has no prefix
+                level = CaseCommodityType.diversified.value
+
+            # check for case_commodity_id
+            case_commodity_id = commodity_level_map.get(level)
+            if not case_commodity_id:
+                raise HTTPException(
+                    status_code=400,
+                    detail=(
+                        f"Case commodity not found for level '{level}' (mapping id: {raw_id})"  # noqa
+                    ),
+                )
+
             question = resolve_question(
                 session=session,
-                question_id=qID,
+                question_id=qid,
                 public_key=row.get("public_key"),
             )
 
-            stats = aggregated[var][seg_name]
-
-            # primary / secondary / tertiary
-            case_commodity_id = case_commodity_levels.get(qLevel)
-            # build SegmentAnswer
-            payload = SegmentAnswerBase(
-                case_commodity=case_commodity_id,
-                segment=seg_id,
-                question=question.id,
-                current_value=float(stats["current"]),
-                feasible_value=float(stats["feasible"]),
+            answers.append(
+                SegmentAnswerBase(
+                    case_commodity=case_commodity_id,
+                    segment=seg_id,
+                    question=question.id,
+                    current_value=current_value,
+                    feasible_value=feasible_value,
+                )
             )
-            seg_answers.append(payload)
 
-        segments_payload.append(
+        segment_payloads.append(
             SegmentUpdateBase(
                 id=seg_id,
                 name=seg_name,
                 case=case_id,
                 number_of_farmers=number_of_farmers,
-                answers=seg_answers,
+                answers=answers,
             )
         )
 
-    # Save segment answers and update segment number_of_farmers
-    update_segment(
-        session=session,
-        payloads=segments_payload,
-    )
-
     # --------------------------------------------------
-    # 7. Cleanup
+    # Persist
     # --------------------------------------------------
+    update_segment(session=session, payloads=segment_payloads)
+
     try:
-        REMOVE = False
-        if REMOVE:
-            os.remove(case_import.file_path)
+        os.remove(case_import.file_path)
     except Exception:
         pass
 
-    # --------------------------------------------------
-    # 8. Response
-    # --------------------------------------------------
     return {
         "status": "success",
         "case_id": case_id,
-        "segments": segments_payload,
-        "total_segments": len(segments_payload),
-        "drivers_processed": len(aggregated),
+        "segments": segment_payloads,
+        "total_segments": len(segment_payloads),
     }