-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Optimize delta binary decoder in the case where bitwidth=0 #9477
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
14fdbf6
2610ce2
9fd7912
11fb149
1361d8a
41a2467
e75f334
7021eee
15274cb
324a63c
f0ab4dc
64626cc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -770,15 +770,48 @@ where | |
|
|
||
| // At this point we have read the deltas to `buffer` we now need to offset | ||
| // these to get back to the original values that were encoded | ||
| for v in &mut buffer[read..read + batch_read] { | ||
| // | ||
| // Optimization: if the bit_width for the miniblock is 0, then we can employ | ||
| // a faster decoding method than setting `value[i] = value[i-1] + value[i] + min_delta`. | ||
| // Where min_delta is 0 (all values in the miniblock are the same), we can simply | ||
| // set all values to `self.last_value`. In the case of non-zero min_delta (values | ||
| // in the mini-block form an arithmetic progression) each value can be computed via | ||
| // `value[i] = (i + 1) * min_delta + last_value`. In both cases we remove the | ||
| // dependence on the preceding value. | ||
| // Kudos to @pitrou for the idea https://github.com/apache/arrow/pull/49296 | ||
| let min_delta = self.min_delta.as_i64()?; | ||
| if bit_width == 0 { | ||
| if min_delta == 0 { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we repeat the
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I can try. The theory, however, is that the speed up is from not having to use the previous value to calculate the current. I'll see if there's any speedup from avoiding the additional
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done. It's good for 5-10% on my laptop. |
||
| buffer[read..read + batch_read].fill(self.last_value); | ||
| } else { | ||
| // the c++ version multiplies min_delta by the iter index, but doing | ||
| // wrapping_mul through T::T was a bit slower. this is still | ||
| // faster than before. | ||
| let mut delta = self.min_delta; | ||
| for v in &mut buffer[read..read + batch_read] { | ||
| *v = self.last_value.wrapping_add(&delta); | ||
| delta = delta.wrapping_add(&self.min_delta); | ||
| } | ||
|
|
||
| self.last_value = buffer[read + batch_read - 1]; | ||
| } | ||
| } else { | ||
| // It is OK for deltas to contain "overflowed" values after encoding, | ||
| // e.g. i64::MAX - i64::MIN, so we use `wrapping_add` to "overflow" again and | ||
| // restore original value. | ||
| *v = v | ||
| .wrapping_add(&self.min_delta) | ||
| .wrapping_add(&self.last_value); | ||
|
|
||
| self.last_value = *v; | ||
| if min_delta == 0 { | ||
| for v in &mut buffer[read..read + batch_read] { | ||
| *v = v.wrapping_add(&self.last_value); | ||
| self.last_value = *v; | ||
| } | ||
| } else { | ||
| for v in &mut buffer[read..read + batch_read] { | ||
| *v = v | ||
| .wrapping_add(&self.min_delta) | ||
| .wrapping_add(&self.last_value); | ||
| self.last_value = *v; | ||
| } | ||
| } | ||
| } | ||
|
|
||
| read += batch_read; | ||
|
|
@@ -1802,6 +1835,49 @@ mod tests { | |
| ); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_delta_bit_packed_int32_single_value_large() { | ||
| let block_data = vec![3; 10240]; | ||
| test_delta_bit_packed_decode::<Int32Type>(vec![block_data]); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_delta_bit_packed_int32_increasing_value_large() { | ||
| let block_data = (0i32..10240).collect(); | ||
| test_delta_bit_packed_decode::<Int32Type>(vec![block_data]); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_delta_bit_packed_int32_mixed_large() { | ||
| // should be enough for 4 mini-blocks plus a little so we get some | ||
| // mixed mini-blocks | ||
| const BLOCK_SIZE: i32 = 133; | ||
| let block1_data = (0..BLOCK_SIZE).map(|i| (i * 7) % 11).collect(); | ||
| let block2_data = vec![3; BLOCK_SIZE as usize]; | ||
| let block3_data = (0..BLOCK_SIZE).map(|i| (i * 5) % 13).collect(); | ||
| let block4_data = (0..BLOCK_SIZE).collect(); | ||
| let block5_data = (0..BLOCK_SIZE).map(|i| (i * 3) % 17).collect(); | ||
| test_delta_bit_packed_decode::<Int32Type>(vec![ | ||
| block1_data, | ||
| block2_data, | ||
| block3_data, | ||
| block4_data, | ||
| block5_data, | ||
| ]); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_delta_bit_packed_int64_single_value_large() { | ||
| let block_data = vec![5; 10240]; | ||
| test_delta_bit_packed_decode::<Int64Type>(vec![block_data]); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_delta_bit_packed_int64_increasing_value_large() { | ||
| let block_data = (0i64..10240).collect(); | ||
| test_delta_bit_packed_decode::<Int64Type>(vec![block_data]); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_delta_byte_array_same_arrays() { | ||
| let data = vec![ | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you submit a PR for just these benchmarks, so we can run the benchmarks on this PR?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I do have a separate branch for the benches (and will submit separately), but combined them in this draft. I'd like to do some tests with the other delta encodings to see if the optimization here has an impact.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ok, done. #9500