Skip to content

Commit 956a740

Browse files
committed
PARQUET-34: Test DictionaryFilter for column with skipped dict encoding
1 parent 79565a3 commit 956a740

File tree

1 file changed

+20
-5
lines changed

1 file changed

+20
-5
lines changed

parquet-hadoop/src/test/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilterTest.java

Lines changed: 20 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -116,6 +116,7 @@ public class DictionaryFilterTest {
116116
+ "required binary fallback_binary_field; "
117117
+ "required int96 int96_field; "
118118
+ "repeated binary repeated_binary_field;"
119+
+ "repeated binary repeated_binary_field_high_cardinality;" // high cardinality, no dict encoding produced
119120
+ "} ");
120121

121122
private static final String ALPHABET = "abcdefghijklmnopqrstuvwxyz";
@@ -197,6 +198,10 @@ private static void writeData(SimpleGroupFactory f, ParquetWriter<Group> writer)
197198
group.append("optional_single_value_field", "sharp");
198199
}
199200

201+
for (char letter : ALPHABET.toCharArray()) {
202+
group = group.append("repeated_binary_field_high_cardinality", String.valueOf(letter));
203+
}
204+
200205
writer.write(group);
201206
}
202207
writer.close();
@@ -218,6 +223,7 @@ private static void prepareFile(WriterVersion version, Path file) throws IOExcep
218223
.withRowGroupSize(1024 * 1024)
219224
.withPageSize(1024)
220225
.enableDictionaryEncoding()
226+
.withDictionaryEncoding("repeated_binary_field_high_cardinality", false)
221227
.withDictionaryPageSize(2 * 1024)
222228
.withConf(conf)
223229
.build();
@@ -510,10 +516,9 @@ public void testGtEqDouble() throws Exception {
510516
@Test
511517
public void testSizeBinary() throws Exception {
512518
// repeated_binary_field dict has 26 distinct values
513-
BinaryColumn b = binaryColumn("repeated_binary_field");
519+
final BinaryColumn b = binaryColumn("repeated_binary_field");
514520

515-
// DictionaryFilter knows that `repeated_binary_field` column has at least 26 element values spread across
516-
// records
521+
// DictionaryFilter infers that col `repeated_binary_field` has >= 26 values spread across row group
517522
assertTrue(canDrop(size(b, Operators.Size.Operator.EQ, 0), ccmd, dictionaries));
518523
assertTrue(canDrop(size(b, Operators.Size.Operator.LT, 1), ccmd, dictionaries));
519524
assertTrue(canDrop(size(b, Operators.Size.Operator.LTE, 0), ccmd, dictionaries));
@@ -522,8 +527,8 @@ public void testSizeBinary() throws Exception {
522527
assertFalse(canDrop(size(b, Operators.Size.Operator.GT, 0), ccmd, dictionaries));
523528
assertFalse(canDrop(size(b, Operators.Size.Operator.GTE, 1), ccmd, dictionaries));
524529

525-
// If column doesn't exist in meta, it should be treated as having size 0
526-
BinaryColumn nonExistentColumn = binaryColumn("nonexistant_col");
530+
// If column doesn't exist in meta, it has no values and can be treated as having size 0
531+
final BinaryColumn nonExistentColumn = binaryColumn("nonexistant_col");
527532

528533
assertTrue(canDrop(size(nonExistentColumn, Operators.Size.Operator.GT, 0), ccmd, dictionaries));
529534
assertTrue(canDrop(size(nonExistentColumn, Operators.Size.Operator.GTE, 1), ccmd, dictionaries));
@@ -532,6 +537,16 @@ public void testSizeBinary() throws Exception {
532537
assertFalse(canDrop(size(nonExistentColumn, Operators.Size.Operator.LT, 1), ccmd, dictionaries));
533538
assertFalse(canDrop(size(nonExistentColumn, Operators.Size.Operator.LTE, 0), ccmd, dictionaries));
534539
assertFalse(canDrop(size(nonExistentColumn, Operators.Size.Operator.EQ, 0), ccmd, dictionaries));
540+
541+
// If column exists but doesn't have a dict, we cannot infer anything about its size
542+
final BinaryColumn noDictColumn = binaryColumn("repeated_binary_field_high_cardinality");
543+
544+
assertFalse(canDrop(size(noDictColumn, Operators.Size.Operator.GT, 0), ccmd, dictionaries));
545+
assertFalse(canDrop(size(noDictColumn, Operators.Size.Operator.GTE, 0), ccmd, dictionaries));
546+
assertFalse(canDrop(size(noDictColumn, Operators.Size.Operator.EQ, 1), ccmd, dictionaries));
547+
assertFalse(canDrop(size(noDictColumn, Operators.Size.Operator.LT, 1), ccmd, dictionaries));
548+
assertFalse(canDrop(size(noDictColumn, Operators.Size.Operator.LTE, 0), ccmd, dictionaries));
549+
assertFalse(canDrop(size(noDictColumn, Operators.Size.Operator.EQ, 0), ccmd, dictionaries));
535550
}
536551

537552
@Test

0 commit comments

Comments
 (0)