@@ -116,6 +116,7 @@ public class DictionaryFilterTest {
116116 + "required binary fallback_binary_field; "
117117 + "required int96 int96_field; "
118118 + "repeated binary repeated_binary_field;"
119+ + "repeated binary repeated_binary_field_high_cardinality;" // dictionary encoding is explicitly disabled for this column in prepareFile, so it has no dict
119120 + "} " );
120121
121122 private static final String ALPHABET = "abcdefghijklmnopqrstuvwxyz" ;
@@ -197,6 +198,10 @@ private static void writeData(SimpleGroupFactory f, ParquetWriter<Group> writer)
197198 group .append ("optional_single_value_field" , "sharp" );
198199 }
199200
201+ for (char letter : ALPHABET .toCharArray ()) {
202+ group = group .append ("repeated_binary_field_high_cardinality" , String .valueOf (letter ));
203+ }
204+
200205 writer .write (group );
201206 }
202207 writer .close ();
@@ -218,6 +223,7 @@ private static void prepareFile(WriterVersion version, Path file) throws IOExcep
218223 .withRowGroupSize (1024 * 1024 )
219224 .withPageSize (1024 )
220225 .enableDictionaryEncoding ()
226+ .withDictionaryEncoding ("repeated_binary_field_high_cardinality" , false )
221227 .withDictionaryPageSize (2 * 1024 )
222228 .withConf (conf )
223229 .build ();
@@ -510,10 +516,9 @@ public void testGtEqDouble() throws Exception {
510516 @ Test
511517 public void testSizeBinary () throws Exception {
512518 // repeated_binary_field dict has 26 distinct values
513- BinaryColumn b = binaryColumn ("repeated_binary_field" );
519+ final BinaryColumn b = binaryColumn ("repeated_binary_field" );
514520
515- // DictionaryFilter knows that `repeated_binary_field` column has at least 26 element values spread across
516- // records
521+ // DictionaryFilter infers that column `repeated_binary_field` has at least 26 values spread across the row group
517522 assertTrue (canDrop (size (b , Operators .Size .Operator .EQ , 0 ), ccmd , dictionaries ));
518523 assertTrue (canDrop (size (b , Operators .Size .Operator .LT , 1 ), ccmd , dictionaries ));
519524 assertTrue (canDrop (size (b , Operators .Size .Operator .LTE , 0 ), ccmd , dictionaries ));
@@ -522,8 +527,8 @@ public void testSizeBinary() throws Exception {
522527 assertFalse (canDrop (size (b , Operators .Size .Operator .GT , 0 ), ccmd , dictionaries ));
523528 assertFalse (canDrop (size (b , Operators .Size .Operator .GTE , 1 ), ccmd , dictionaries ));
524529
525- // If column doesn't exist in meta, it should be treated as having size 0
526- BinaryColumn nonExistentColumn = binaryColumn ("nonexistant_col" );
530+ // If column doesn't exist in meta, it has no values and can be treated as having size 0
531+ final BinaryColumn nonExistentColumn = binaryColumn ("nonexistant_col" );
527532
528533 assertTrue (canDrop (size (nonExistentColumn , Operators .Size .Operator .GT , 0 ), ccmd , dictionaries ));
529534 assertTrue (canDrop (size (nonExistentColumn , Operators .Size .Operator .GTE , 1 ), ccmd , dictionaries ));
@@ -532,6 +537,16 @@ public void testSizeBinary() throws Exception {
532537 assertFalse (canDrop (size (nonExistentColumn , Operators .Size .Operator .LT , 1 ), ccmd , dictionaries ));
533538 assertFalse (canDrop (size (nonExistentColumn , Operators .Size .Operator .LTE , 0 ), ccmd , dictionaries ));
534539 assertFalse (canDrop (size (nonExistentColumn , Operators .Size .Operator .EQ , 0 ), ccmd , dictionaries ));
540+
541+ // If column exists but doesn't have a dict, we cannot infer anything about its size
542+ final BinaryColumn noDictColumn = binaryColumn ("repeated_binary_field_high_cardinality" );
543+
544+ assertFalse (canDrop (size (noDictColumn , Operators .Size .Operator .GT , 0 ), ccmd , dictionaries ));
545+ assertFalse (canDrop (size (noDictColumn , Operators .Size .Operator .GTE , 0 ), ccmd , dictionaries ));
546+ assertFalse (canDrop (size (noDictColumn , Operators .Size .Operator .EQ , 1 ), ccmd , dictionaries ));
547+ assertFalse (canDrop (size (noDictColumn , Operators .Size .Operator .LT , 1 ), ccmd , dictionaries ));
548+ assertFalse (canDrop (size (noDictColumn , Operators .Size .Operator .LTE , 0 ), ccmd , dictionaries ));
549+ assertFalse (canDrop (size (noDictColumn , Operators .Size .Operator .EQ , 0 ), ccmd , dictionaries ));
535550 }
536551
537552 @ Test
0 commit comments