Skip to content

Commit 79565a3

Browse files
committed
PARQUET-34: Fix DictionaryFilter
1 parent ea20875 commit 79565a3

File tree

2 files changed

+17
-15
lines changed

2 files changed

+17
-15
lines changed

parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -508,16 +508,16 @@ public Boolean visit(Size size) {
508508
try {
509509
// We know the block has at least as many array elements as the dictionary sizes
510510
final Set<?> dict = expandDictionary(meta);
511-
if (dict == null) {
511+
512+
// If the column doesn't have a dictionary encoding (or its dictionary is empty), we can't infer anything about its size
513+
if (dict == null || dict.isEmpty()) {
512514
return BLOCK_MIGHT_MATCH;
513515
}
514-
int numDistinctValues = dict.size();
515-
final boolean blockCannotMatch = size.filter(
516-
(eq) -> eq < numDistinctValues,
517-
(lt) -> lt <= numDistinctValues,
518-
(lte) -> lte < numDistinctValues,
519-
(gt) -> false,
520-
(gte) -> false);
516+
517+
// Column has at least (nonempty) dict.size() values spread across all records;
518+
// predicates that match empty arrays cannot match
519+
final boolean blockCannotMatch =
520+
size.filter((eq) -> eq == 0, (lt) -> lt == 1, (lte) -> lte <= 1, (gt) -> false, (gte) -> false);
521521

522522
return blockCannotMatch ? BLOCK_CANNOT_MATCH : BLOCK_MIGHT_MATCH;
523523
} catch (IOException e) {

parquet-hadoop/src/test/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilterTest.java

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -509,16 +509,18 @@ public void testGtEqDouble() throws Exception {
509509

510510
@Test
511511
public void testSizeBinary() throws Exception {
512+
// repeated_binary_field dict has 26 distinct values
512513
BinaryColumn b = binaryColumn("repeated_binary_field");
513514

514-
// DictionaryFilter knows that `repeated_binary_field` column has at least 26 element values
515-
assertFalse(canDrop(size(b, Operators.Size.Operator.GT, 26), ccmd, dictionaries));
516-
assertFalse(canDrop(size(b, Operators.Size.Operator.GTE, 27), ccmd, dictionaries));
517-
assertFalse(canDrop(size(b, Operators.Size.Operator.EQ, 27), ccmd, dictionaries));
515+
// DictionaryFilter knows that `repeated_binary_field` column has at least 26 element values spread across
516+
// records
517+
assertTrue(canDrop(size(b, Operators.Size.Operator.EQ, 0), ccmd, dictionaries));
518+
assertTrue(canDrop(size(b, Operators.Size.Operator.LT, 1), ccmd, dictionaries));
519+
assertTrue(canDrop(size(b, Operators.Size.Operator.LTE, 0), ccmd, dictionaries));
518520

519-
assertTrue(canDrop(size(b, Operators.Size.Operator.LT, 26), ccmd, dictionaries));
520-
assertTrue(canDrop(size(b, Operators.Size.Operator.LTE, 25), ccmd, dictionaries));
521-
assertTrue(canDrop(size(b, Operators.Size.Operator.EQ, 25), ccmd, dictionaries));
521+
assertFalse(canDrop(size(b, Operators.Size.Operator.EQ, 30), ccmd, dictionaries));
522+
assertFalse(canDrop(size(b, Operators.Size.Operator.GT, 0), ccmd, dictionaries));
523+
assertFalse(canDrop(size(b, Operators.Size.Operator.GTE, 1), ccmd, dictionaries));
522524

523525
// If column doesn't exist in meta, it should be treated as having size 0
524526
BinaryColumn nonExistentColumn = binaryColumn("nonexistant_col");

0 commit comments

Comments
 (0)