Skip to content

Commit 1c953c7

Browse files
committed
Fix tdigest
1 parent 8142308 commit 1c953c7

File tree

2 files changed

+25
-1
lines changed

2 files changed

+25
-1
lines changed

datafusion/core/tests/dataframe/dataframe_functions.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -510,7 +510,7 @@ async fn test_fn_approx_percentile_cont() -> Result<()> {
510510
+-------------------------------------------------------------------------------------+
511511
| approx_percentile_cont(Float64(0.1),Int32(2)) WITHIN GROUP [test.b DESC NULLS LAST] |
512512
+-------------------------------------------------------------------------------------+
513-
| 69 |
513+
| 100 |
514514
+-------------------------------------------------------------------------------------+
515515
");
516516

datafusion/functions-aggregate-common/src/tdigest.rs

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -443,6 +443,13 @@ impl TDigest {
443443
return self.max();
444444
}
445445

446+
// If rank reaches the last unit of weight, return max directly.
447+
// Without this, interpolation at the last centroid boundary can
448+
// produce p90 > p99 on sparse data (e.g. 10 values).
449+
if rank >= self.count - 1.0 {
450+
return self.max();
451+
}
452+
446453
pos = 0;
447454
t = self.count;
448455

@@ -735,6 +742,23 @@ mod tests {
735742
assert_state_roundtrip!(t);
736743
}
737744

745+
// On sparse data, higher quantiles must not return lower values than lower quantiles.
746+
#[test]
747+
fn test_sparse_dataset_quantile_ordering() {
748+
let values = vec![1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0, 1000.0];
749+
let t = TDigest::new(100);
750+
let t = t.merge_unsorted_f64(values);
751+
752+
let p50 = t.estimate_quantile(0.5);
753+
let p90 = t.estimate_quantile(0.9);
754+
let p99 = t.estimate_quantile(0.99);
755+
756+
assert!(p50 <= p90, "p50 ({p50}) should be <= p90 ({p90})");
757+
assert!(p90 <= p99, "p90 ({p90}) should be <= p99 ({p99})");
758+
assert_eq!(p90, 1000.0, "p90 should be max on boundary rank");
759+
assert_eq!(p99, 1000.0, "p99 should be max on boundary rank");
760+
}
761+
738762
#[test]
739763
fn test_size() {
740764
let t = TDigest::new(10);

0 commit comments

Comments
 (0)