Skip to content

Commit 1a77f08

Browse files
hdikemanfacebook-github-bot
authored andcommitted
fix: Allow merging OR-ed bigint single value ranges (facebookincubator#15738)
Summary: Since BIGINT ranges can sometimes be converted to values filters (i.e. if all the range filters are single value), sequential merging of several BIGINT ranges can cause a generic MultiRange filter to be created, which cannot be evaluated. e.g. `(id = 100 OR id = 200 OR id = 300)` is converted by HiveConnectorUtil::extractFiltersFromRemainingFilter in the following stages: ``` I1209 13:29:11.955765 1472505 [CPUThreadPool0] ExprToSubfieldFilter.cpp:584] tryMergeBigintRanges called with 2 disjuncts I1209 13:29:11.955819 1472505 [CPUThreadPool0] ExprToSubfieldFilter.cpp:586] Disjunct 0: BigintRange: [100, 100] no nulls I1209 13:29:11.955826 1472505 [CPUThreadPool0] ExprToSubfieldFilter.cpp:586] Disjunct 1: BigintRange: [200, 200] no nulls I1209 13:29:11.955830 1472505 [CPUThreadPool0] ExprToSubfieldFilter.cpp:602] Creating BigintValues with 2 values I1209 13:29:11.955852 1472505 [CPUThreadPool0] ExprToSubfieldFilter.cpp:584] tryMergeBigintRanges called with 2 disjuncts I1209 13:29:11.955862 1472505 [CPUThreadPool0] ExprToSubfieldFilter.cpp:586] Disjunct 0: Filter(BigintValuesUsingBitmask, deterministic, no nulls) I1209 13:29:11.955871 1472505 [CPUThreadPool0] ExprToSubfieldFilter.cpp:586] Disjunct 1: BigintRange: [300, 300] no nulls W1209 13:29:11.955875 1472505 [CPUThreadPool0] ExprToSubfieldFilter.cpp:609] tryMergeBigintRanges returning nullptr - not all disjuncts are BigintRange/BigintMultiRange ``` The filter is instead merged into a generic MultiRange filter with bigint ranges, which cannot be evaluated: ``` Filter(MultiRange, deterministic, no nulls): testInt64Range() is not supported. ``` Generic MultiRange does not support BIGINT type by design. Instead of converting to a MultiRange, merge logic should have explicit handling for BIGINT value filters so it can handle these gracefully Differential Revision: D88789308
1 parent e7810c9 commit 1a77f08

File tree

3 files changed

+334
-7
lines changed

3 files changed

+334
-7
lines changed

velox/connectors/hive/tests/HiveConnectorTest.cpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -739,6 +739,23 @@ TEST_F(HiveConnectorTest, disjuncts) {
739739
ASSERT_EQ(filters.begin()->first, Subfield("c0"));
740740
VELOX_ASSERT_FILTER(exec::between(1, 12), filters.begin()->second);
741741
}
742+
743+
{
744+
auto expr = parseExpr("c0 not in (1, 3) or c0 in (1, 2)", rowType);
745+
SubfieldFilters filters;
746+
double sampleRate = 1;
747+
auto remaining = extractFiltersFromRemainingFilter(
748+
expr, &evaluator, filters, sampleRate);
749+
ASSERT_EQ(sampleRate, 1);
750+
ASSERT_FALSE(remaining);
751+
ASSERT_EQ(filters.size(), 1);
752+
VELOX_ASSERT_FILTER(
753+
exec::bigintOr(
754+
exec::between(
755+
std::numeric_limits<int64_t>::min(), static_cast<int64_t>(2)),
756+
exec::greaterThanOrEqual(static_cast<int64_t>(4))),
757+
filters.begin()->second);
758+
}
742759
}
743760

744761
#undef VELOX_ASSERT_FILTER

velox/expression/ExprToSubfieldFilter.cpp

Lines changed: 134 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,68 @@ bool isBigintMultiRange(const std::unique_ptr<common::Filter>& filter) {
5858
return filter->is(common::FilterKind::kBigintMultiRange);
5959
}
6060

61+
bool isBigintValues(const std::unique_ptr<common::Filter>& filter) {
62+
return filter->is(common::FilterKind::kBigintValuesUsingHashTable) ||
63+
filter->is(common::FilterKind::kBigintValuesUsingBitmask);
64+
}
65+
66+
bool isNegatedBigintRange(const std::unique_ptr<common::Filter>& filter) {
67+
return filter->is(common::FilterKind::kNegatedBigintRange);
68+
}
69+
70+
bool isNegatedBigintValues(const std::unique_ptr<common::Filter>& filter) {
71+
return filter->is(common::FilterKind::kNegatedBigintValuesUsingHashTable) ||
72+
filter->is(common::FilterKind::kNegatedBigintValuesUsingBitmask);
73+
}
74+
75+
std::vector<std::unique_ptr<common::BigintRange>> invertNegatedBigintRange(
76+
const common::NegatedBigintRange* negatedRange) {
77+
std::vector<std::unique_ptr<common::BigintRange>> ranges;
78+
int64_t lower = negatedRange->lower();
79+
int64_t upper = negatedRange->upper();
80+
81+
if (lower > std::numeric_limits<int64_t>::min()) {
82+
ranges.emplace_back(
83+
std::make_unique<common::BigintRange>(
84+
std::numeric_limits<int64_t>::min(), lower - 1, false));
85+
}
86+
if (upper < std::numeric_limits<int64_t>::max()) {
87+
ranges.emplace_back(
88+
std::make_unique<common::BigintRange>(
89+
upper + 1, std::numeric_limits<int64_t>::max(), false));
90+
}
91+
return ranges;
92+
}
93+
94+
std::vector<std::unique_ptr<common::BigintRange>> invertNegatedBigintValues(
95+
const std::vector<int64_t>& values) {
96+
VELOX_CHECK(std::is_sorted(values.begin(), values.end()));
97+
std::vector<std::unique_ptr<common::BigintRange>> ranges;
98+
ranges.reserve(values.size() + 1);
99+
100+
if (values.front() > std::numeric_limits<int64_t>::min()) {
101+
ranges.emplace_back(
102+
std::make_unique<common::BigintRange>(
103+
std::numeric_limits<int64_t>::min(), values.front() - 1, false));
104+
}
105+
106+
for (size_t i = 0; i < values.size() - 1; ++i) {
107+
if (values[i] + 1 <= values[i + 1] - 1) {
108+
ranges.emplace_back(
109+
std::make_unique<common::BigintRange>(
110+
values[i] + 1, values[i + 1] - 1, false));
111+
}
112+
}
113+
114+
if (values.back() < std::numeric_limits<int64_t>::max()) {
115+
ranges.emplace_back(
116+
std::make_unique<common::BigintRange>(
117+
values.back() + 1, std::numeric_limits<int64_t>::max(), false));
118+
}
119+
120+
return ranges;
121+
}
122+
61123
template <typename T, typename U>
62124
std::unique_ptr<T> asUniquePtr(std::unique_ptr<U> ptr) {
63125
return std::unique_ptr<T>(static_cast<T*>(ptr.release()));
@@ -597,25 +659,90 @@ std::unique_ptr<common::Filter> tryMergeBigintRanges(
597659
}
598660

599661
if (!std::all_of(disjuncts.begin(), disjuncts.end(), [](const auto& filter) {
600-
return isBigintRange(filter) || isBigintMultiRange(filter);
662+
return isBigintRange(filter) || isBigintMultiRange(filter) ||
663+
isBigintValues(filter) || isNegatedBigintRange(filter) ||
664+
isNegatedBigintValues(filter);
601665
})) {
602666
return nullptr;
603667
}
604668

605669
const bool nullAllowed = isNullAllowed(disjuncts);
606670

671+
std::vector<int64_t> values;
607672
std::vector<std::unique_ptr<common::BigintRange>> ranges;
673+
608674
for (auto& filter : disjuncts) {
609-
if (isBigintRange(filter)) {
610-
ranges.emplace_back(asBigintRange(filter));
611-
} else {
612-
for (const auto& range :
613-
filter->as<common::BigintMultiRange>()->ranges()) {
614-
ranges.emplace_back(std::make_unique<common::BigintRange>(*range));
675+
switch (filter->kind()) {
676+
case common::FilterKind::kBigintRange: {
677+
auto* range = filter->as<common::BigintRange>();
678+
if (range->isSingleValue()) {
679+
values.push_back(range->lower());
680+
} else {
681+
ranges.emplace_back(asBigintRange(filter));
682+
}
683+
break;
684+
}
685+
case common::FilterKind::kBigintMultiRange: {
686+
for (const auto& range :
687+
filter->as<common::BigintMultiRange>()->ranges()) {
688+
if (range->isSingleValue()) {
689+
values.push_back(range->lower());
690+
} else {
691+
ranges.emplace_back(std::make_unique<common::BigintRange>(*range));
692+
}
693+
}
694+
break;
695+
}
696+
case common::FilterKind::kBigintValuesUsingHashTable:
697+
case common::FilterKind::kBigintValuesUsingBitmask: {
698+
std::vector<int64_t> vals;
699+
if (filter->is(common::FilterKind::kBigintValuesUsingHashTable)) {
700+
vals = filter->as<common::BigintValuesUsingHashTable>()->values();
701+
} else {
702+
vals = filter->as<common::BigintValuesUsingBitmask>()->values();
703+
}
704+
values.insert(values.end(), vals.begin(), vals.end());
705+
break;
706+
}
707+
case common::FilterKind::kNegatedBigintRange: {
708+
auto* negatedRange = filter->as<common::NegatedBigintRange>();
709+
auto convertedRanges = invertNegatedBigintRange(negatedRange);
710+
for (auto& range : convertedRanges) {
711+
ranges.emplace_back(std::move(range));
712+
}
713+
break;
714+
}
715+
case common::FilterKind::kNegatedBigintValuesUsingHashTable:
716+
case common::FilterKind::kNegatedBigintValuesUsingBitmask: {
717+
std::vector<int64_t> vals;
718+
if (filter->is(
719+
common::FilterKind::kNegatedBigintValuesUsingHashTable)) {
720+
vals =
721+
filter->as<common::NegatedBigintValuesUsingHashTable>()->values();
722+
} else {
723+
vals =
724+
filter->as<common::NegatedBigintValuesUsingBitmask>()->values();
725+
}
726+
auto convertedRanges = invertNegatedBigintValues(vals);
727+
for (auto& range : convertedRanges) {
728+
ranges.emplace_back(std::move(range));
729+
}
730+
break;
615731
}
732+
default:
733+
VELOX_UNREACHABLE();
616734
}
617735
}
618736

737+
if (ranges.empty()) {
738+
return common::createBigintValues(values, nullAllowed);
739+
}
740+
741+
for (int64_t value : values) {
742+
ranges.emplace_back(
743+
std::make_unique<common::BigintRange>(value, value, nullAllowed));
744+
}
745+
619746
std::sort(ranges.begin(), ranges.end(), [](const auto& a, const auto& b) {
620747
return a->lower() < b->lower();
621748
});

velox/expression/tests/ExprToSubfieldFilterTest.cpp

Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,189 @@ TEST_F(ExprToSubfieldFilterTest, makeOrFilterBigint) {
283283
}
284284
}
285285

286+
TEST_F(ExprToSubfieldFilterTest, makeOrFilterBigintValues) {
287+
// (a in (100)) + (a in (200)) -> a in (100, 200)
288+
{
289+
auto values1 = in({100});
290+
auto values2 = in({200});
291+
auto expected = in({100, 200});
292+
293+
VELOX_ASSERT_FILTER(expected, makeOr(values1->clone(), values2->clone()));
294+
}
295+
296+
// (a in (100)) + (a = 200) -> a in (100, 200)
297+
{
298+
auto values = in({100});
299+
auto singleValueRange = equal(200);
300+
auto expected = in({100, 200});
301+
302+
VELOX_ASSERT_FILTER(
303+
expected, makeOr(values->clone(), singleValueRange->clone()));
304+
}
305+
306+
// (a in (5)) + (a between 10 and 20) -> a = 5 or a between 10 and 20
307+
{
308+
auto values = in({5});
309+
auto multiValueRange = between(10, 20);
310+
auto expected = bigintOr(equal(5), between(10, 20));
311+
312+
VELOX_ASSERT_FILTER(
313+
expected, makeOr(values->clone(), multiValueRange->clone()));
314+
}
315+
316+
// (a = 100 or a = 200) + (a = 300) -> a in (100, 200, 300)
317+
{
318+
std::vector<std::unique_ptr<common::BigintRange>> ranges;
319+
ranges.emplace_back(std::make_unique<common::BigintRange>(100, 100, false));
320+
ranges.emplace_back(std::make_unique<common::BigintRange>(200, 200, false));
321+
auto multiRangeAllSingle =
322+
std::make_unique<common::BigintMultiRange>(std::move(ranges), false);
323+
324+
auto singleValueRange = equal(300);
325+
auto expected = in({100, 200, 300});
326+
327+
VELOX_ASSERT_FILTER(
328+
expected,
329+
makeOr(singleValueRange->clone(), multiRangeAllSingle->clone()));
330+
}
331+
332+
// (a = 5 or a between 10 and 20) + (a = 100) ->
333+
// (a = 5 or a between 10 and 20 or a = 100)
334+
{
335+
std::vector<std::unique_ptr<common::BigintRange>> ranges;
336+
ranges.emplace_back(std::make_unique<common::BigintRange>(5, 5, false));
337+
ranges.emplace_back(std::make_unique<common::BigintRange>(10, 20, false));
338+
auto multiRangeMixed =
339+
std::make_unique<common::BigintMultiRange>(std::move(ranges), false);
340+
341+
auto singleValueRange = equal(100);
342+
343+
std::vector<std::unique_ptr<common::BigintRange>> expectedRanges;
344+
expectedRanges.emplace_back(
345+
std::make_unique<common::BigintRange>(5, 5, false));
346+
expectedRanges.emplace_back(
347+
std::make_unique<common::BigintRange>(10, 20, false));
348+
expectedRanges.emplace_back(
349+
std::make_unique<common::BigintRange>(100, 100, false));
350+
auto expected = std::make_unique<common::BigintMultiRange>(
351+
std::move(expectedRanges), false);
352+
353+
VELOX_ASSERT_FILTER(
354+
expected, makeOr(singleValueRange->clone(), multiRangeMixed->clone()));
355+
}
356+
357+
// (a = 5 or a = 15) + (a between 10 and 20) -> (a = 5 or a between 10 and 20)
358+
{
359+
std::vector<std::unique_ptr<common::BigintRange>> ranges;
360+
ranges.emplace_back(std::make_unique<common::BigintRange>(5, 5, false));
361+
ranges.emplace_back(std::make_unique<common::BigintRange>(15, 15, false));
362+
auto multiRangeAllSingle =
363+
std::make_unique<common::BigintMultiRange>(std::move(ranges), false);
364+
365+
auto multiValueRange = between(10, 20);
366+
367+
std::vector<std::unique_ptr<common::BigintRange>> expectedRanges;
368+
expectedRanges.emplace_back(
369+
std::make_unique<common::BigintRange>(5, 5, false));
370+
expectedRanges.emplace_back(
371+
std::make_unique<common::BigintRange>(10, 20, false));
372+
auto expected = std::make_unique<common::BigintMultiRange>(
373+
std::move(expectedRanges), false);
374+
375+
VELOX_ASSERT_FILTER(
376+
expected,
377+
makeOr(multiValueRange->clone(), multiRangeAllSingle->clone()));
378+
}
379+
380+
// (a = 5 or a between 25 and 30) + (a between 10 and 20) ->
381+
// (a = 5 or a between 10 and 20 or a between 25 and 30)
382+
{
383+
std::vector<std::unique_ptr<common::BigintRange>> ranges;
384+
ranges.emplace_back(std::make_unique<common::BigintRange>(5, 5, false));
385+
ranges.emplace_back(std::make_unique<common::BigintRange>(25, 30, false));
386+
auto multiRangeMixed =
387+
std::make_unique<common::BigintMultiRange>(std::move(ranges), false);
388+
389+
auto multiValueRange = between(10, 20);
390+
391+
std::vector<std::unique_ptr<common::BigintRange>> expectedRanges;
392+
expectedRanges.emplace_back(
393+
std::make_unique<common::BigintRange>(5, 5, false));
394+
expectedRanges.emplace_back(
395+
std::make_unique<common::BigintRange>(10, 20, false));
396+
expectedRanges.emplace_back(
397+
std::make_unique<common::BigintRange>(25, 30, false));
398+
auto expected = std::make_unique<common::BigintMultiRange>(
399+
std::move(expectedRanges), false);
400+
401+
VELOX_ASSERT_FILTER(
402+
expected, makeOr(multiValueRange->clone(), multiRangeMixed->clone()));
403+
}
404+
405+
// (a not in (5)) + (a = 10) -> all values except 5
406+
{
407+
auto negatedValues = notIn({5});
408+
auto singleValue = equal(10);
409+
auto expected = bigintOr(
410+
between(std::numeric_limits<int64_t>::min(), 4), greaterThanOrEqual(6));
411+
412+
VELOX_ASSERT_FILTER(
413+
expected, makeOr(negatedValues->clone(), singleValue->clone()));
414+
}
415+
416+
// (a not in (1, 3)) + (a in (1, 2)) ->
417+
// (a between INT64_MIN and 2 or a between 3 and INT64_MAX)
418+
{
419+
auto negatedValues = notIn({1, 3});
420+
auto values = in({1, 2});
421+
auto expected = bigintOr(
422+
between(std::numeric_limits<int64_t>::min(), 2), greaterThanOrEqual(4));
423+
424+
VELOX_ASSERT_FILTER(
425+
expected, makeOr(negatedValues->clone(), values->clone()));
426+
}
427+
428+
// (a != 10) + (a = 10) -> all non-null values
429+
{
430+
auto negatedRange = notEqual(10);
431+
auto singleValue = equal(10);
432+
433+
VELOX_ASSERT_FILTER(
434+
isNotNull(), makeOr(negatedRange->clone(), singleValue->clone()));
435+
}
436+
437+
// (a not in (5, 10, 15)) + (a between 8 and 12) ->
438+
// [INT64_MIN, 4] or [6, 14] or [16, INT64_MAX]
439+
{
440+
auto negatedValues = notIn({5, 10, 15});
441+
auto range = between(8, 12);
442+
443+
std::vector<std::unique_ptr<common::BigintRange>> expectedRanges;
444+
expectedRanges.emplace_back(
445+
std::make_unique<common::BigintRange>(
446+
std::numeric_limits<int64_t>::min(), 4, false));
447+
expectedRanges.emplace_back(
448+
std::make_unique<common::BigintRange>(6, 14, false));
449+
expectedRanges.emplace_back(
450+
std::make_unique<common::BigintRange>(
451+
16, std::numeric_limits<int64_t>::max(), false));
452+
auto expected = std::make_unique<common::BigintMultiRange>(
453+
std::move(expectedRanges), false);
454+
455+
VELOX_ASSERT_FILTER(
456+
expected, makeOr(negatedValues->clone(), range->clone()));
457+
}
458+
459+
// (a != 5) + (a != 10) -> all non-null values
460+
{
461+
auto negated1 = notEqual(5);
462+
auto negated2 = notEqual(10);
463+
464+
VELOX_ASSERT_FILTER(
465+
isNotNull(), makeOr(negated1->clone(), negated2->clone()));
466+
}
467+
}
468+
286469
TEST_F(ExprToSubfieldFilterTest, makeOrFilterDouble) {
287470
// a < 1.5 or a > 1.0 ==> not null
288471
{

0 commit comments

Comments
 (0)