Skip to content

Commit 4851585

Browse files
[#723] Fix segmentation floating point precision issues
1 parent 0969179 commit 4851585

File tree

3 files changed

+54
-1
lines changed

3 files changed

+54
-1
lines changed

GEMINI.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,8 @@ Income Driver Calculator (IDC) is a web application designed to help companies t
196196
- Prevented creation of "artificial" thresholds in data gaps, eliminating empty segments with 0 farmers.
197197
- Implemented support for "Equal Interval" strategy in backend segmentation logic.
198198
- Updated Pydantic models to support optional strategy selection in segmentation previews and recalculations.
199+
- Resolved floating point precision issues by rounding input data to 2 decimal places, so that values slightly exceeding a cut threshold (e.g., `1.95` stored as `1.9500001`) are no longer excluded from segments.
200+
- Added regression test `test_segmentation_repro.py` to verify boundary handling.
199201
- **Technical Improvements & Workflows**:
200202
- **General Refactoring**: Split the monolithic `idc-antigravity-skills` into granular components: `idc-core`, `idc-database`, and `idc-testing`.
201203
- **Workflows**: Updated `check_time`, `commit_changes`, and `create_pr` workflows with automated branch detection; implemented `seed_data`, `view_logs`, and `run_frontend_test`.
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
import numpy as np
2+
import pandas as pd
3+
from utils.case_import_processing import calculate_numerical_segments_from_cuts
4+
5+
6+
def test_segmentation_floating_point_boundary():
    """Keep a value a hair above a cut inside that cut's segment.

    Scenario: the data holds 1.95 plus a tiny floating point error while
    the only cut threshold is exactly 1.95. With right-closed bins,
    ``np.digitize`` yields index ``i`` when ``bins[i-1] < x <= bins[i]``
    and ``len(bins)`` when ``x`` exceeds the last bin, so the raw value
    would be assigned index 1, i.e. past the single cut. Per the original
    author's notes, counts beyond the last cut are dropped when the
    segments are assembled, so the farmer would disappear. The value must
    instead be bucketed at index 0, as if it were exactly 1.95.
    """
    threshold = 1.95
    drift = 2.3e-16  # approximate value from debug output
    noisy_value = threshold + drift

    # Guard: the drift must survive the addition, otherwise the test
    # would not exercise the boundary at all.
    assert noisy_value > threshold

    frame = pd.DataFrame({"land": [noisy_value]})
    result = calculate_numerical_segments_from_cuts(
        frame, "land", np.array([threshold])
    )

    # Exactly one segment, and it holds the single farmer.
    assert len(result) == 1
    assert result[0]["number_of_farmers"] == 1

backend/utils/case_import_processing.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,8 @@ def generate_numerical_cut_values(
9494
cuts = np.linspace(np.min(series), np.max(series), n_segments + 1)[1:]
9595
else:
9696
# Equal-frequency cuts via quantiles
97-
# method='closest_observation' ensures thresholds are actual data points
97+
# method='closest_observation' ensures
98+
# thresholds are actual data points
9899
quantiles = np.linspace(0, 1, n_segments + 1)[1:]
99100
cuts = np.quantile(
100101
series,
@@ -138,6 +139,11 @@ def calculate_numerical_segments_from_cuts(
138139
- max: upper bound of the segment range
139140
"""
140141
values = df[column].dropna().to_numpy()
142+
# Round values to 2 decimals to match the precision of cuts
143+
# (which are also rounded to 2 decimals)
144+
# This prevents floating point errors where e.g. 1.9500000001
145+
# falls into the >1.95 bucket
146+
values = np.round(values, 2)
141147

142148
# Assign bucket indices
143149
bucket_idx = np.digitize(values, bins=cuts, right=True)
@@ -263,6 +269,8 @@ def recalculate_numerical_segments(
263269
)
264270

265271
series = df[seg_var].dropna().to_numpy()
272+
# Round values to 2 decimals to match the precision of cuts
273+
series = np.round(series, 2)
266274
is_numeric = pd.api.types.is_numeric_dtype(df[seg_var])
267275
is_integer_data = is_numeric and np.all(np.mod(series, 1) == 0)
268276

0 commit comments

Comments
 (0)