diff --git a/crates/iceberg/src/transform/bucket.rs b/crates/iceberg/src/transform/bucket.rs index 8807fb1f79..e6786a70ca 100644 --- a/crates/iceberg/src/transform/bucket.rs +++ b/crates/iceberg/src/transform/bucket.rs @@ -78,12 +78,26 @@ impl Bucket { /// ref: https://iceberg.apache.org/spec/#appendix-b-32-bit-hash-requirements #[inline] fn hash_decimal(v: i128) -> i32 { + if v == 0 { + return Self::hash_bytes(&[0]); + } + let bytes = v.to_be_bytes(); - if let Some(start) = bytes.iter().position(|&x| x != 0) { - Self::hash_bytes(&bytes[start..]) + let start = if v > 0 { + // Positive: skip 0x00 unless next byte would appear negative + bytes + .windows(2) + .position(|w| w[0] != 0x00 || w[1] & 0x80 != 0) + .unwrap_or(15) } else { - Self::hash_bytes(&[0]) - } + // Negative: skip 0xFF only if next byte stays negative + bytes + .windows(2) + .position(|w| w[0] != 0xFF || w[1] & 0x80 == 0) + .unwrap_or(15) + }; + + Self::hash_bytes(&bytes[start..]) } /// def bucket_N(x) = (murmur3_x86_32_hash(x) & Integer.MAX_VALUE) % N @@ -790,6 +804,27 @@ mod test { ); } + #[test] + fn test_hash_decimal_with_negative_value() { + // Test cases from GitHub issue #1981 + assert_eq!(Bucket::hash_decimal(1), -463810133); + assert_eq!(Bucket::hash_decimal(-1), -43192051); + + // Additional test cases for edge case values + assert_eq!(Bucket::hash_decimal(0), Bucket::hash_decimal(0)); + assert_eq!(Bucket::hash_decimal(127), Bucket::hash_decimal(127)); + assert_eq!(Bucket::hash_decimal(-128), Bucket::hash_decimal(-128)); + + // Test minimum representation is used + // -1 should hash as [0xFF] not [0xFF, 0xFF, ..., 0xFF] + // 128 should hash as [0x00, 0x80] not [0x00, 0x00, ..., 0x80] + assert_eq!(Bucket::hash_decimal(128), Bucket::hash_bytes(&[0x00, 0x80])); + assert_eq!( + Bucket::hash_decimal(-129), + Bucket::hash_bytes(&[0xFF, 0x7F]) + ); + } + #[test] fn test_int_literal() { let bucket = Bucket::new(10);