[#723] Fix segmentation floating point precision issues

wayangalihpratama · wayangalihpratama · commit 48515852eea5 · 2026-02-19T16:34:25.000+08:00
diff --git a/GEMINI.md b/GEMINI.md
@@ -196,6 +196,8 @@ Income Driver Calculator (IDC) is a web application designed to help companies t
     - Prevented creation of "artificial" thresholds in data gaps, eliminating empty segments with 0 farmers.
     - Implemented support for "Equal Interval" strategy in backend segmentation logic.
     - Updated Pydantic models to support optional strategy selection in segmentation previews and recalculations.
+    - Resolved floating point precision issues where values slightly exceeding cut thresholds (e.g., `1.95` stored as `1.9500001`) were excluded from segments; input data is now rounded to 2 decimal places before bucketing.
+    - Added regression test `test_segmentation_repro.py` to verify boundary handling.
 - **Technical Improvements & Workflows**:
     - **General Refactoring**: Split the monolithic `idc-antigravity-skills` into granular components: `idc-core`, `idc-database`, and `idc-testing`.
     - **Workflows**: Updated `check_time`, `commit_changes`, and `create_pr` workflows with automated branch detection; implemented `seed_data`, `view_logs`, and `run_frontend_test`.
diff --git a/backend/tests/test_segmentation_repro.py b/backend/tests/test_segmentation_repro.py
@@ -0,0 +1,43 @@
+import numpy as np
+import pandas as pd
+from utils.case_import_processing import calculate_numerical_segments_from_cuts
+
+
+def test_segmentation_floating_point_boundary():
+    # Issue: Data has 1.95 + epsilon, but cut is 1.95.
+    # Expectation: 1.95+epsilon should be included in
+    # the 1.95 bucket if it's "close enough"
+    # or if we round data before bucketing.
+
+    epsilon = 2.3e-16  # approximate value from debug output
+    val = 1.95 + epsilon
+
+    # Verify that val is strictly greater than 1.95
+    assert val > 1.95
+
+    df = pd.DataFrame({"land": [val]})
+    cuts = np.array([1.95])
+
+    segments = calculate_numerical_segments_from_cuts(df, "land", cuts)
+
+    # We expect 1 segment with 1 farmer.
+    # Without rounding, because val > 1.95, digitize returns index
+    # 1 (index i refers to the bucket whose upper bound is cuts[i]).
+    # cuts=[1.95].
+    # bins[0] = 1.95.
+    # digitize (right=True) returns i such that bins[i-1] < x <= bins[i].
+    # If x > bins[-1], returns len(bins).
+    # Here len(cuts)=1. Returns 1.
+
+    # If returns 1, it means it's beyond the last cut.
+    # segments loop iterates zip(cuts, counts).
+    # If counts has length 2 (index 0 and 1), and cuts has length 1.
+    # zip stops after 1 iteration.
+    # So index 1 count is IGNORED.
+
+    # We want the count to be in index 0.
+    # So we want digitize to return 0.
+    # i=0 means x <= bins[0].
+
+    assert len(segments) == 1
+    assert segments[0]["number_of_farmers"] == 1
diff --git a/backend/utils/case_import_processing.py b/backend/utils/case_import_processing.py
@@ -94,7 +94,8 @@ def generate_numerical_cut_values(
         cuts = np.linspace(np.min(series), np.max(series), n_segments + 1)[1:]
     else:
         # Equal-frequency cuts via quantiles
-        # method='closest_observation' ensures thresholds are actual data points
+        # method='closest_observation' ensures
+        # thresholds are actual data points
         quantiles = np.linspace(0, 1, n_segments + 1)[1:]
         cuts = np.quantile(
             series,
@@ -138,6 +139,11 @@ def calculate_numerical_segments_from_cuts(
         - max: upper bound of the segment range
     """
     values = df[column].dropna().to_numpy()
+    # Round values to 2 decimals to match the precision of cuts
+    # (which are also rounded to 2 decimals).
+    # This prevents floating point errors where e.g. 1.9500000001
+    # falls into the >1.95 bucket.
+    values = np.round(values, 2)
 
     # Assign bucket indices
     bucket_idx = np.digitize(values, bins=cuts, right=True)
@@ -263,6 +269,8 @@ def recalculate_numerical_segments(
             )
 
         series = df[seg_var].dropna().to_numpy()
+        # Round values to 2 decimals to match the precision of cuts
+        series = np.round(series, 2)
         is_numeric = pd.api.types.is_numeric_dtype(df[seg_var])
         is_integer_data = is_numeric and np.all(np.mod(series, 1) == 0)