Skip to content

Commit 9cbc23b

Browse files
authored
Merge pull request #10740 from sundy-li/range_pruner
chore(query): add compare domain for string types
2 parents c524ed3 + 1af7082 commit 9cbc23b

File tree

4 files changed

+196
-89
lines changed

4 files changed

+196
-89
lines changed

src/query/expression/src/property.rs

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -380,3 +380,112 @@ impl Domain {
380380
}
381381
}
382382
}
383+
384+
pub trait SimpleDomainCmp {
385+
fn domain_eq(&self, other: &Self) -> FunctionDomain<BooleanType>;
386+
fn domain_noteq(&self, other: &Self) -> FunctionDomain<BooleanType>;
387+
fn domain_gt(&self, other: &Self) -> FunctionDomain<BooleanType>;
388+
fn domain_gte(&self, other: &Self) -> FunctionDomain<BooleanType>;
389+
fn domain_lt(&self, other: &Self) -> FunctionDomain<BooleanType>;
390+
fn domain_lte(&self, other: &Self) -> FunctionDomain<BooleanType>;
391+
}
392+
393+
const ALL_TRUE_DOMAIN: BooleanDomain = BooleanDomain {
394+
has_true: true,
395+
has_false: false,
396+
};
397+
398+
const ALL_FALSE_DOMAIN: BooleanDomain = BooleanDomain {
399+
has_true: false,
400+
has_false: true,
401+
};
402+
403+
impl<T: Ord + PartialOrd> SimpleDomainCmp for SimpleDomain<T> {
404+
fn domain_eq(&self, other: &Self) -> FunctionDomain<BooleanType> {
405+
if self.min > other.max || self.max < other.min {
406+
FunctionDomain::Domain(ALL_FALSE_DOMAIN)
407+
} else {
408+
FunctionDomain::Full
409+
}
410+
}
411+
412+
fn domain_noteq(&self, other: &Self) -> FunctionDomain<BooleanType> {
413+
if self.min > other.max || self.max < other.min {
414+
FunctionDomain::Domain(ALL_TRUE_DOMAIN)
415+
} else {
416+
FunctionDomain::Full
417+
}
418+
}
419+
420+
fn domain_gt(&self, other: &Self) -> FunctionDomain<BooleanType> {
421+
if self.min > other.max {
422+
FunctionDomain::Domain(ALL_TRUE_DOMAIN)
423+
} else if self.max <= other.min {
424+
FunctionDomain::Domain(ALL_FALSE_DOMAIN)
425+
} else {
426+
FunctionDomain::Full
427+
}
428+
}
429+
430+
fn domain_gte(&self, other: &Self) -> FunctionDomain<BooleanType> {
431+
if self.min >= other.max {
432+
FunctionDomain::Domain(ALL_TRUE_DOMAIN)
433+
} else if self.max < other.min {
434+
FunctionDomain::Domain(ALL_FALSE_DOMAIN)
435+
} else {
436+
FunctionDomain::Full
437+
}
438+
}
439+
440+
fn domain_lt(&self, other: &Self) -> FunctionDomain<BooleanType> {
441+
if self.max < other.min {
442+
FunctionDomain::Domain(ALL_TRUE_DOMAIN)
443+
} else if self.min >= other.max {
444+
FunctionDomain::Domain(ALL_FALSE_DOMAIN)
445+
} else {
446+
FunctionDomain::Full
447+
}
448+
}
449+
450+
fn domain_lte(&self, other: &Self) -> FunctionDomain<BooleanType> {
451+
if self.max <= other.min {
452+
FunctionDomain::Domain(ALL_TRUE_DOMAIN)
453+
} else if self.min > other.max {
454+
FunctionDomain::Domain(ALL_FALSE_DOMAIN)
455+
} else {
456+
FunctionDomain::Full
457+
}
458+
}
459+
}
460+
461+
impl SimpleDomainCmp for StringDomain {
462+
fn domain_eq(&self, other: &Self) -> FunctionDomain<BooleanType> {
463+
let (d1, d2) = self.unify(other);
464+
d1.domain_eq(&d2)
465+
}
466+
467+
fn domain_noteq(&self, other: &Self) -> FunctionDomain<BooleanType> {
468+
let (d1, d2) = self.unify(other);
469+
d1.domain_noteq(&d2)
470+
}
471+
472+
fn domain_gt(&self, other: &Self) -> FunctionDomain<BooleanType> {
473+
let (d1, d2) = self.unify(other);
474+
d1.domain_gt(&d2)
475+
}
476+
477+
fn domain_gte(&self, other: &Self) -> FunctionDomain<BooleanType> {
478+
let (d1, d2) = self.unify(other);
479+
d1.domain_gte(&d2)
480+
}
481+
482+
fn domain_lt(&self, other: &Self) -> FunctionDomain<BooleanType> {
483+
let (d1, d2) = self.unify(other);
484+
d1.domain_lt(&d2)
485+
}
486+
487+
fn domain_lte(&self, other: &Self) -> FunctionDomain<BooleanType> {
488+
let (d1, d2) = self.unify(other);
489+
d1.domain_lte(&d2)
490+
}
491+
}

src/query/expression/src/types/string.rs

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ use common_arrow::arrow::trusted_len::TrustedLen;
2020
use serde::Deserialize;
2121
use serde::Serialize;
2222

23+
use super::SimpleDomain;
2324
use crate::property::Domain;
2425
use crate::types::ArgType;
2526
use crate::types::DataType;
@@ -411,5 +412,31 @@ impl<'a> FromIterator<&'a [u8]> for StringColumnBuilder {
411412
/// Range of byte-string values, described by its bounds.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct StringDomain {
    /// Lower bound of the range.
    pub min: Vec<u8>,
    /// Upper bound of the range; `None` stands for the full domain, i.e. no
    /// upper bound is recorded.
    pub max: Option<Vec<u8>>,
}
418+
419+
impl StringDomain {
420+
pub fn unify(&self, other: &Self) -> (SimpleDomain<Vec<u8>>, SimpleDomain<Vec<u8>>) {
421+
let mut max_size = self.min.len().max(other.min.len());
422+
if let Some(max) = &self.max {
423+
max_size = max_size.max(max.len());
424+
}
425+
if let Some(max) = &other.max {
426+
max_size = max_size.max(max.len());
427+
}
428+
429+
let max_value = vec![255; max_size + 1];
430+
431+
(
432+
SimpleDomain {
433+
min: self.min.clone(),
434+
max: self.max.clone().unwrap_or_else(|| max_value.clone()),
435+
},
436+
SimpleDomain {
437+
min: other.min.clone(),
438+
max: other.max.clone().unwrap_or_else(|| max_value.clone()),
439+
},
440+
)
441+
}
442+
}

src/query/functions/src/scalars/comparison.rs

Lines changed: 11 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ use common_expression::FunctionProperty;
4343
use common_expression::FunctionRegistry;
4444
use common_expression::FunctionSignature;
4545
use common_expression::ScalarRef;
46+
use common_expression::SimpleDomainCmp;
4647
use common_expression::ValueRef;
4748
use memchr::memmem;
4849
use regex::bytes::Regex;
@@ -125,130 +126,51 @@ fn register_variant_cmp(registry: &mut FunctionRegistry) {
125126
);
126127
}
127128

128-
fn register_string_cmp(registry: &mut FunctionRegistry) {
129-
registry.register_2_arg::<StringType, StringType, BooleanType, _, _>(
130-
"eq",
131-
FunctionProperty::default(),
132-
|_, _| FunctionDomain::Full,
133-
|lhs, rhs, _| lhs == rhs,
134-
);
135-
registry.register_2_arg::<StringType, StringType, BooleanType, _, _>(
136-
"noteq",
137-
FunctionProperty::default(),
138-
|_, _| FunctionDomain::Full,
139-
|lhs, rhs, _| lhs != rhs,
140-
);
141-
registry.register_2_arg::<StringType, StringType, BooleanType, _, _>(
142-
"gt",
143-
FunctionProperty::default(),
144-
|_, _| FunctionDomain::Full,
145-
|lhs, rhs, _| lhs > rhs,
146-
);
147-
registry.register_2_arg::<StringType, StringType, BooleanType, _, _>(
148-
"gte",
149-
FunctionProperty::default(),
150-
|_, _| FunctionDomain::Full,
151-
|lhs, rhs, _| lhs >= rhs,
152-
);
153-
registry.register_2_arg::<StringType, StringType, BooleanType, _, _>(
154-
"lt",
155-
FunctionProperty::default(),
156-
|_, _| FunctionDomain::Full,
157-
|lhs, rhs, _| lhs < rhs,
158-
);
159-
registry.register_2_arg::<StringType, StringType, BooleanType, _, _>(
160-
"lte",
161-
FunctionProperty::default(),
162-
|_, _| FunctionDomain::Full,
163-
|lhs, rhs, _| lhs <= rhs,
164-
);
165-
}
166-
167129
macro_rules! register_simple_domain_type_cmp {
168130
($registry:ident, $T:ty) => {
169131
$registry.register_2_arg::<$T, $T, BooleanType, _, _>(
170132
"eq",
171133
FunctionProperty::default(),
172-
|d1, d2| {
173-
if d1.min > d2.max || d1.max < d2.min {
174-
FunctionDomain::Domain(ALL_FALSE_DOMAIN)
175-
} else {
176-
FunctionDomain::Full
177-
}
178-
},
134+
|d1, d2| d1.domain_eq(d2),
179135
|lhs, rhs, _| lhs == rhs,
180136
);
181137
$registry.register_2_arg::<$T, $T, BooleanType, _, _>(
182138
"noteq",
183139
FunctionProperty::default(),
184-
|d1, d2| {
185-
if d1.min > d2.max || d1.max < d2.min {
186-
FunctionDomain::Domain(ALL_TRUE_DOMAIN)
187-
} else {
188-
FunctionDomain::Full
189-
}
190-
},
140+
|d1, d2| d1.domain_noteq(d2),
191141
|lhs, rhs, _| lhs != rhs,
192142
);
193143
$registry.register_2_arg::<$T, $T, BooleanType, _, _>(
194144
"gt",
195145
FunctionProperty::default(),
196-
|d1, d2| {
197-
if d1.min > d2.max {
198-
FunctionDomain::Domain(ALL_TRUE_DOMAIN)
199-
} else if d1.max <= d2.min {
200-
FunctionDomain::Domain(ALL_FALSE_DOMAIN)
201-
} else {
202-
FunctionDomain::Full
203-
}
204-
},
146+
|d1, d2| d1.domain_gt(d2),
205147
|lhs, rhs, _| lhs > rhs,
206148
);
207149
$registry.register_2_arg::<$T, $T, BooleanType, _, _>(
208150
"gte",
209151
FunctionProperty::default(),
210-
|d1, d2| {
211-
if d1.min >= d2.max {
212-
FunctionDomain::Domain(ALL_TRUE_DOMAIN)
213-
} else if d1.max < d2.min {
214-
FunctionDomain::Domain(ALL_FALSE_DOMAIN)
215-
} else {
216-
FunctionDomain::Full
217-
}
218-
},
152+
|d1, d2| d1.domain_gte(d2),
219153
|lhs, rhs, _| lhs >= rhs,
220154
);
221155
$registry.register_2_arg::<$T, $T, BooleanType, _, _>(
222156
"lt",
223157
FunctionProperty::default(),
224-
|d1, d2| {
225-
if d1.max < d2.min {
226-
FunctionDomain::Domain(ALL_TRUE_DOMAIN)
227-
} else if d1.min >= d2.max {
228-
FunctionDomain::Domain(ALL_FALSE_DOMAIN)
229-
} else {
230-
FunctionDomain::Full
231-
}
232-
},
158+
|d1, d2| d1.domain_lt(d2),
233159
|lhs, rhs, _| lhs < rhs,
234160
);
235161
$registry.register_2_arg::<$T, $T, BooleanType, _, _>(
236162
"lte",
237163
FunctionProperty::default(),
238-
|d1, d2| {
239-
if d1.max <= d2.min {
240-
FunctionDomain::Domain(ALL_TRUE_DOMAIN)
241-
} else if d1.min > d2.max {
242-
FunctionDomain::Domain(ALL_FALSE_DOMAIN)
243-
} else {
244-
FunctionDomain::Full
245-
}
246-
},
164+
|d1, d2| d1.domain_lte(d2),
247165
|lhs, rhs, _| lhs <= rhs,
248166
);
249167
};
250168
}
251169

170+
/// Registers the comparison functions for `StringType` on `registry` by
/// expanding `register_simple_domain_type_cmp!`.
fn register_string_cmp(registry: &mut FunctionRegistry) {
171+
register_simple_domain_type_cmp!(registry, StringType);
172+
}
173+
252174
/// Registers the comparison functions for `DateType` on `registry` by
/// expanding `register_simple_domain_type_cmp!`.
fn register_date_cmp(registry: &mut FunctionRegistry) {
253175
register_simple_domain_type_cmp!(registry, DateType);
254176
}
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
statement ok
2+
create table range_t(c varchar, i int)
3+
4+
statement ok
5+
insert into range_t values ('bcd', 1), ('efg', 10)
6+
7+
query T
8+
explain select 1 from range_t where c > 'efg'
9+
----
10+
EvalScalar
11+
├── expressions: [1]
12+
├── estimated rows: 0.67
13+
└── Filter
14+
├── filters: [range_t.c (#0) > "efg"]
15+
├── estimated rows: 0.67
16+
└── TableScan
17+
├── table: default.default.range_t
18+
├── read rows: 0
19+
├── read bytes: 0
20+
├── partitions total: 1
21+
├── partitions scanned: 0
22+
├── pruning stats: [segments: <range pruning: 1 to 0>, blocks: <range pruning: 0 to 0, bloom pruning: 0 to 0>]
23+
├── push downs: [filters: [range_t.c (#0) > "efg"], limit: NONE]
24+
├── output columns: [c]
25+
└── estimated rows: 2.00
26+
27+
28+
query T
29+
explain select 1 from range_t where i > 20
30+
----
31+
EvalScalar
32+
├── expressions: [1]
33+
├── estimated rows: 0.67
34+
└── Filter
35+
├── filters: [range_t.i (#1) > 20]
36+
├── estimated rows: 0.67
37+
└── TableScan
38+
├── table: default.default.range_t
39+
├── read rows: 0
40+
├── read bytes: 0
41+
├── partitions total: 1
42+
├── partitions scanned: 0
43+
├── pruning stats: [segments: <range pruning: 1 to 0>, blocks: <range pruning: 0 to 0, bloom pruning: 0 to 0>]
44+
├── push downs: [filters: [range_t.i (#1) > 20], limit: NONE]
45+
├── output columns: [i]
46+
└── estimated rows: 2.00
47+
48+
statement ok
49+
drop table range_t

0 commit comments

Comments
 (0)