Skip to content

Commit da4eaf3

Browse files
authored
Merge pull request #10731 from sundy-li/with_capacity_hint
2 parents c04080b + 4652c47 commit da4eaf3

File tree

3 files changed

+29
-10
lines changed

3 files changed

+29
-10
lines changed

src/query/expression/src/types/string.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,7 @@ impl StringColumnBuilder {
246246
let mut offsets = Vec::with_capacity(len + 1);
247247
offsets.push(0);
248248
StringColumnBuilder {
249-
need_estimated: data_capacity == 0,
249+
need_estimated: data_capacity == 0 && len > 0,
250250
data: Vec::with_capacity(data_capacity),
251251
offsets,
252252
}
@@ -330,7 +330,7 @@ impl StringColumnBuilder {
330330
let bytes_per_row = self.data.len() / 64 + 1;
331331
let bytes_estimate = bytes_per_row * self.offsets.capacity();
332332

333-
const MAX_HINT_SIZE: usize = 1000000000;
333+
const MAX_HINT_SIZE: usize = 1_000_000_000;
334334
// if we are more than 10% over the capacity, we reserve more
335335
if bytes_estimate < MAX_HINT_SIZE
336336
&& bytes_estimate as f64 > self.data.capacity() as f64 * 1.10f64

src/query/expression/src/values.rs

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1927,6 +1927,16 @@ impl ColumnBuilder {
19271927
}
19281928

19291929
pub fn with_capacity(ty: &DataType, capacity: usize) -> ColumnBuilder {
1930+
ColumnBuilder::with_capacity_hint(ty, capacity, true)
1931+
}
1932+
1933+
/// Creates a new column builder for `ty` with the given `capacity` and `enable_datasize_hint` setting.
1934+
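/// `enable_datasize_hint` only affects `StringColumnBuilder` (String/Variant): when true, the data buffer starts with zero capacity so its size can be estimated later; when false, `capacity` bytes of data are pre-allocated up front.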
1935+
pub fn with_capacity_hint(
1936+
ty: &DataType,
1937+
capacity: usize,
1938+
enable_datasize_hint: bool,
1939+
) -> ColumnBuilder {
19301940
match ty {
19311941
DataType::Null => ColumnBuilder::Null { len: 0 },
19321942
DataType::EmptyArray => ColumnBuilder::EmptyArray { len: 0 },
@@ -1939,27 +1949,28 @@ impl ColumnBuilder {
19391949
}
19401950
DataType::Boolean => ColumnBuilder::Boolean(MutableBitmap::with_capacity(capacity)),
19411951
DataType::String => {
1942-
ColumnBuilder::String(StringColumnBuilder::with_capacity(capacity, 0))
1952+
let data_capacity = if enable_datasize_hint { 0 } else { capacity };
1953+
ColumnBuilder::String(StringColumnBuilder::with_capacity(capacity, data_capacity))
19431954
}
19441955
DataType::Timestamp => ColumnBuilder::Timestamp(Vec::with_capacity(capacity)),
19451956
DataType::Date => ColumnBuilder::Date(Vec::with_capacity(capacity)),
19461957
DataType::Nullable(ty) => ColumnBuilder::Nullable(Box::new(NullableColumnBuilder {
1947-
builder: Self::with_capacity(ty, capacity),
1958+
builder: Self::with_capacity_hint(ty, capacity, enable_datasize_hint),
19481959
validity: MutableBitmap::with_capacity(capacity),
19491960
})),
19501961
DataType::Array(ty) => {
19511962
let mut offsets = Vec::with_capacity(capacity + 1);
19521963
offsets.push(0);
19531964
ColumnBuilder::Array(Box::new(ArrayColumnBuilder {
1954-
builder: Self::with_capacity(ty, 0),
1965+
builder: Self::with_capacity_hint(ty, 0, enable_datasize_hint),
19551966
offsets,
19561967
}))
19571968
}
19581969
DataType::Map(ty) => {
19591970
let mut offsets = Vec::with_capacity(capacity + 1);
19601971
offsets.push(0);
19611972
ColumnBuilder::Map(Box::new(ArrayColumnBuilder {
1962-
builder: Self::with_capacity(ty, 0),
1973+
builder: Self::with_capacity_hint(ty, 0, enable_datasize_hint),
19631974
offsets,
19641975
}))
19651976
}
@@ -1968,12 +1979,15 @@ impl ColumnBuilder {
19681979
ColumnBuilder::Tuple(
19691980
fields
19701981
.iter()
1971-
.map(|field| Self::with_capacity(field, capacity))
1982+
.map(|field| {
1983+
Self::with_capacity_hint(field, capacity, enable_datasize_hint)
1984+
})
19721985
.collect(),
19731986
)
19741987
}
19751988
DataType::Variant => {
1976-
ColumnBuilder::Variant(StringColumnBuilder::with_capacity(capacity, 0))
1989+
let data_capacity = if enable_datasize_hint { 0 } else { capacity };
1990+
ColumnBuilder::Variant(StringColumnBuilder::with_capacity(capacity, data_capacity))
19771991
}
19781992
DataType::Generic(_) => {
19791993
unreachable!("unable to initialize column builder for generic type")

src/query/pipeline/sources/src/input_formats/input_format_text.rs

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -463,7 +463,11 @@ impl<T: InputFormatTextBase> BlockBuilder<T> {
463463
.mutable_columns
464464
.iter_mut()
465465
.map(|col| {
466-
let empty_builder = ColumnBuilder::with_capacity(&col.data_type(), 0);
466+
let empty_builder = ColumnBuilder::with_capacity_hint(
467+
&col.data_type(),
468+
self.ctx.block_compact_thresholds.min_rows_per_block,
469+
false,
470+
);
467471
std::mem::replace(col, empty_builder).build()
468472
})
469473
.collect();
@@ -504,9 +508,10 @@ impl<T: InputFormatTextBase> BlockBuilderTrait for BlockBuilder<T> {
504508
.fields()
505509
.iter()
506510
.map(|f| {
507-
ColumnBuilder::with_capacity(
511+
ColumnBuilder::with_capacity_hint(
508512
&f.data_type().into(),
509513
ctx.block_compact_thresholds.min_rows_per_block,
514+
false,
510515
)
511516
})
512517
.collect();

0 commit comments

Comments
 (0)