
Commit 8cc8c11

Optimize multi-column grouping with StringView/ByteView (option 2) - 25% faster (#19413)
## Which issue does this PR close?

- Part of #18411
- Closes #19344
- Closes #19364

Note this is an alternate to #19364.

## Rationale for this change

@camuel found a query where DuckDB's raw grouping is faster. I looked into it and much of the difference can be explained by better vectorization in the comparisons and short string optimizations.

## What changes are included in this PR?

Optimizations to the multi-column group-by ByteView builder (commented inline on the diff).

## Are these changes tested?

By CI. See also the benchmark results below; I tested manually as well.

Create data:

```shell
nice tpchgen-cli --tables=lineitem --format=parquet --scale-factor 100
```

Run query:

```shell
hyperfine --warmup 3 " datafusion-cli -c \"select l_returnflag,l_linestatus, count(*) as count_order from 'lineitem.parquet' group by l_returnflag, l_linestatus;\" "
```

Before (main): 1.368 s

```shell
Benchmark 1: datafusion-cli -c "select l_returnflag,l_linestatus, count(*) as count_order from 'lineitem.parquet' group by l_returnflag, l_linestatus;"
  Time (mean ± σ):      1.393 s ±  0.020 s    [User: 16.778 s, System: 0.688 s]
  Range (min … max):    1.368 s …  1.438 s    10 runs
```

After (this PR): 1.022 s

```shell
Benchmark 1: ./datafusion-cli-multi-gby-try2 -c "select l_returnflag,l_linestatus, count(*) as count_order from 'lineitem.parquet' group by l_returnflag, l_linestatus;"
  Time (mean ± σ):      1.022 s ±  0.015 s    [User: 11.685 s, System: 0.644 s]
  Range (min … max):    1.005 s …  1.052 s    10 runs
```

I have a PR that improves string view hashing performance too, see #19374.

## Are there any user-facing changes?

Faster performance.
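The rationale above credits much of the speedup to short string optimizations: StringView/ByteView values of at most 12 bytes are stored inline in the 16-byte (u128) view rather than in a data buffer, so for columns like `l_returnflag`/`l_linestatus` equality can be decided by comparing the raw views. Below is a small illustrative sketch of that property using the `arrow` crate; it is not code from this PR, and the array contents are made up for the example.

```rust
// Illustrative sketch only (not code from this PR): short StringView values
// are fully inlined in the u128 view, so equal strings have identical views.
use arrow::array::StringViewArray;

fn main() {
    // TPC-H l_returnflag / l_linestatus style values: all 1 byte, fully inlined
    let a = StringViewArray::from_iter_values(["N", "O", "N", "F"]);
    let b = StringViewArray::from_iter_values(["N", "O", "A", "F"]);

    // With every value inlined there is typically no data buffer at all,
    // which is what the `has_buffers` check in this PR detects.
    println!("data buffers: {}", a.data_buffers().len());

    // Comparing the raw u128 views is then equivalent to comparing the strings.
    for (va, vb) in a.views().iter().zip(b.views().iter()) {
        println!("views equal: {}", va == vb);
    }
}
```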
1 parent 5419ff5 commit 8cc8c11


datafusion/physical-plan/src/aggregates/group_values/multi_group_by/bytes_view.rs

Lines changed: 78 additions & 43 deletions
@@ -99,7 +99,8 @@ impl<B: ByteViewType> ByteViewGroupValueBuilder<B> {
 
     fn equal_to_inner(&self, lhs_row: usize, array: &ArrayRef, rhs_row: usize) -> bool {
         let array = array.as_byte_view::<B>();
-        self.do_equal_to_inner(lhs_row, array, rhs_row)
+        // since this is a single row comparison, don't bother specializing for nulls/buffers
+        self.do_equal_to_inner::<true, true>(lhs_row, array, rhs_row)
     }
 
     fn append_val_inner(&mut self, array: &ArrayRef, row: usize) {
@@ -117,15 +118,16 @@ impl<B: ByteViewType> ByteViewGroupValueBuilder<B> {
         self.do_append_val_inner(arr, row);
     }
 
-    fn vectorized_equal_to_inner(
+    // Don't inline to keep the code small and give LLVM the best chance of
+    // vectorizing the inner loop
+    #[inline(never)]
+    fn vectorized_equal_to_inner<const HAS_NULLS: bool, const HAS_BUFFERS: bool>(
         &self,
         lhs_rows: &[usize],
-        array: &ArrayRef,
+        array: &GenericByteViewArray<B>,
         rhs_rows: &[usize],
         equal_to_results: &mut [bool],
     ) {
-        let array = array.as_byte_view::<B>();
-
         let iter = izip!(
             lhs_rows.iter(),
             rhs_rows.iter(),
@@ -138,7 +140,8 @@ impl<B: ByteViewType> ByteViewGroupValueBuilder<B> {
                 continue;
             }
 
-            *equal_to_result = self.do_equal_to_inner(lhs_row, array, rhs_row);
+            *equal_to_result =
+                self.do_equal_to_inner::<HAS_NULLS, HAS_BUFFERS>(lhs_row, array, rhs_row);
         }
     }
 
@@ -216,26 +219,42 @@ impl<B: ByteViewType> ByteViewGroupValueBuilder<B> {
         }
     }
 
-    fn do_equal_to_inner(
+    /// Compare the value at `lhs_row` in this builder with
+    /// the value at `rhs_row` in input `array`
+    ///
+    /// Templated so that the inner compare loop can be
+    /// specialized based on the input array
+    #[inline(always)]
+    fn do_equal_to_inner<const HAS_NULLS: bool, const HAS_BUFFERS: bool>(
        &self,
         lhs_row: usize,
         array: &GenericByteViewArray<B>,
         rhs_row: usize,
     ) -> bool {
         // Check if nulls equal firstly
-        let exist_null = self.nulls.is_null(lhs_row);
-        let input_null = array.is_null(rhs_row);
-        if let Some(result) = nulls_equal_to(exist_null, input_null) {
-            return result;
+        if HAS_NULLS {
+            let exist_null = self.nulls.is_null(lhs_row);
+            let input_null = array.is_null(rhs_row);
+            if let Some(result) = nulls_equal_to(exist_null, input_null) {
+                return result;
+            }
         }
 
         // Otherwise, we need to check their values
-        let exist_view = self.views[lhs_row];
+
+        // SAFETY: the `lhs_row` and rhs_row` are valid
+        let exist_view = unsafe { *self.views.get_unchecked(lhs_row) };
         let exist_view_len = exist_view as u32;
 
-        let input_view = array.views()[rhs_row];
+        let input_view = unsafe { *array.views().get_unchecked(rhs_row) };
         let input_view_len = input_view as u32;
 
+        // fast path, if we know there are no buffers, then the view must be inlined
+        // so we can simply compare the u128 views
+        if !HAS_BUFFERS {
+            return exist_view == input_view;
+        }
+
         // The check logic
         //   - Check len equality
         //   - If inlined, check inlined value
@@ -246,19 +265,8 @@ impl<B: ByteViewType> ByteViewGroupValueBuilder<B> {
         }
 
         if exist_view_len <= 12 {
-            let exist_inline = unsafe {
-                GenericByteViewArray::<B>::inline_value(
-                    &exist_view,
-                    exist_view_len as usize,
-                )
-            };
-            let input_inline = unsafe {
-                GenericByteViewArray::<B>::inline_value(
-                    &input_view,
-                    input_view_len as usize,
-                )
-            };
-            exist_inline == input_inline
+            // both inlined, so compare inlined value
+            exist_view == input_view
         } else {
             let exist_prefix =
                 unsafe { GenericByteViewArray::<B>::inline_value(&exist_view, 4) };
@@ -269,30 +277,28 @@ impl<B: ByteViewType> ByteViewGroupValueBuilder<B> {
             return false;
         }
 
+        // get the full values and compare
         let exist_full = {
             let byte_view = ByteView::from(exist_view);
-            self.value(
-                byte_view.buffer_index as usize,
-                byte_view.offset as usize,
-                byte_view.length as usize,
-            )
+            let buffer_index = byte_view.buffer_index as usize;
+            let offset = byte_view.offset as usize;
+            let length = byte_view.length as usize;
+            debug_assert!(buffer_index <= self.completed.len());
+
+            unsafe {
+                if buffer_index < self.completed.len() {
+                    let block = self.completed.get_unchecked(buffer_index);
+                    block.as_slice().get_unchecked(offset..offset + length)
+                } else {
+                    self.in_progress.get_unchecked(offset..offset + length)
+                }
+            }
         };
         let input_full: &[u8] = unsafe { array.value_unchecked(rhs_row).as_ref() };
         exist_full == input_full
     }
 
-    fn value(&self, buffer_index: usize, offset: usize, length: usize) -> &[u8] {
-        debug_assert!(buffer_index <= self.completed.len());
-
-        if buffer_index < self.completed.len() {
-            let block = &self.completed[buffer_index];
-            &block[offset..offset + length]
-        } else {
-            &self.in_progress[offset..offset + length]
-        }
-    }
-
     fn build_inner(self) -> ArrayRef {
         let Self {
             views,
@@ -507,7 +513,36 @@ impl<B: ByteViewType> GroupColumn for ByteViewGroupValueBuilder<B> {
         rows: &[usize],
         equal_to_results: &mut [bool],
     ) {
-        self.vectorized_equal_to_inner(group_indices, array, rows, equal_to_results);
+        let has_nulls = array.null_count() != 0;
+        let array = array.as_byte_view::<B>();
+        let has_buffers = !array.data_buffers().is_empty();
+        // call specialized version based on nulls and buffers presence
+        match (has_nulls, has_buffers) {
+            (true, true) => self.vectorized_equal_to_inner::<true, true>(
+                group_indices,
+                array,
+                rows,
+                equal_to_results,
+            ),
+            (true, false) => self.vectorized_equal_to_inner::<true, false>(
+                group_indices,
+                array,
+                rows,
+                equal_to_results,
+            ),
+            (false, true) => self.vectorized_equal_to_inner::<false, true>(
+                group_indices,
+                array,
+                rows,
+                equal_to_results,
+            ),
+            (false, false) => self.vectorized_equal_to_inner::<false, false>(
+                group_indices,
+                array,
+                rows,
                equal_to_results,
+            ),
+        }
     }
 
     fn vectorized_append(&mut self, array: &ArrayRef, rows: &[usize]) -> Result<()> {
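For reference, the `match (has_nulls, has_buffers)` dispatch added in the last hunk is the usual const-generic specialization pattern: the flags are inspected once per batch, and each arm calls a separately monomorphized copy of the loop in which `HAS_NULLS`/`HAS_BUFFERS` are compile-time constants. Below is a minimal standalone sketch of that pattern; it is illustrative only, and the toy `compare_all`/`compare_dispatch` functions and the 0-means-NULL convention are invented for the example.

```rust
// Minimal sketch of const-generic specialization (illustrative only, not from
// this file). The flag is resolved once per batch; inside the monomorphized
// loop the `if HAS_NULLS` branch is a compile-time constant and folds away.
fn compare_all<const HAS_NULLS: bool>(lhs: &[u32], rhs: &[u32], out: &mut [bool]) {
    for ((l, r), o) in lhs.iter().zip(rhs).zip(out.iter_mut()) {
        if HAS_NULLS && (*l == 0 || *r == 0) {
            // pretend 0 encodes NULL; nulls never match in this toy example
            *o = false;
            continue;
        }
        *o = l == r;
    }
}

fn compare_dispatch(has_nulls: bool, lhs: &[u32], rhs: &[u32], out: &mut [bool]) {
    // branch once per batch, not once per element
    if has_nulls {
        compare_all::<true>(lhs, rhs, out);
    } else {
        compare_all::<false>(lhs, rhs, out);
    }
}

fn main() {
    let lhs = [1, 2, 3];
    let rhs = [1, 0, 3];
    let mut out = [false; 3];
    compare_dispatch(true, &lhs, &rhs, &mut out);
    println!("{out:?}"); // [true, false, true]
}
```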

0 commit comments
