@@ -109,6 +109,7 @@ public class ParquetReader
     private static final int BATCH_SIZE_GROWTH_FACTOR = 2;
     public static final String PARQUET_CODEC_METRIC_PREFIX = "ParquetReaderCompressionFormat_";
     public static final String COLUMN_INDEX_ROWS_FILTERED = "ParquetColumnIndexRowsFiltered";
+    public static final String PARQUET_READER_DICTIONARY_FILTERED_ROWGROUPS = "ParquetReaderDictionaryFilteredRowGroups";
 
     private final Optional<String> fileCreatedBy;
     private final List<RowGroupInfo> rowGroups;
@@ -151,6 +152,7 @@ public class ParquetReader
     private int currentPageId;
 
     private long columnIndexRowsFiltered = -1;
+    private long dictionaryFilteredRowGroups;
     private final Optional<FileDecryptionContext> decryptionContext;
 
     public ParquetReader(
@@ -467,38 +469,67 @@ private int nextBatch()
     private boolean advanceToNextRowGroup()
             throws IOException
     {
-        currentRowGroupMemoryContext.close();
-        currentRowGroupMemoryContext = memoryContext.newAggregatedMemoryContext();
-        freeCurrentRowGroupBuffers();
-
-        if (currentRowGroup >= 0 && rowGroupStatisticsValidation.isPresent()) {
-            StatisticsValidation statisticsValidation = rowGroupStatisticsValidation.get();
-            writeValidation.orElseThrow().validateRowGroupStatistics(dataSource.getId(), currentBlockMetadata, statisticsValidation.build());
-            statisticsValidation.reset();
-        }
-
-        currentRowGroup++;
-        if (currentRowGroup == rowGroups.size()) {
-            return false;
-        }
-        RowGroupInfo rowGroupInfo = rowGroups.get(currentRowGroup);
-        currentBlockMetadata = rowGroupInfo.prunedBlockMetadata();
-        firstRowIndexInGroup = rowGroupInfo.fileRowOffset();
-        currentGroupRowCount = currentBlockMetadata.getRowCount();
-        FilteredRowRanges currentGroupRowRanges = blockRowRanges[currentRowGroup];
-        log.debug("advanceToNextRowGroup dataSource %s, currentRowGroup %d, rowRanges %s, currentBlockMetadata %s", dataSource.getId(), currentRowGroup, currentGroupRowRanges, currentBlockMetadata);
-        if (currentGroupRowRanges != null) {
-            long rowCount = currentGroupRowRanges.getRowCount();
-            columnIndexRowsFiltered += currentGroupRowCount - rowCount;
-            if (rowCount == 0) {
-                // Filters on multiple columns with page indexes may yield non-overlapping row ranges and eliminate the entire row group.
-                // Advance to next row group to ensure that we don't return a null Page and close the page source before all row groups are processed
-                return advanceToNextRowGroup();
+        while (currentRowGroup < rowGroups.size()) {
+            currentRowGroupMemoryContext.close();
+            currentRowGroupMemoryContext = memoryContext.newAggregatedMemoryContext();
+            freeCurrentRowGroupBuffers();
+
+            if (currentRowGroup >= 0 && rowGroupStatisticsValidation.isPresent()) {
+                StatisticsValidation statisticsValidation = rowGroupStatisticsValidation.get();
+                writeValidation.orElseThrow().validateRowGroupStatistics(dataSource.getId(), currentBlockMetadata, statisticsValidation.build());
+                statisticsValidation.reset();
+            }
+
+            currentRowGroup++;
+            if (currentRowGroup == rowGroups.size()) {
+                return false;
+            }
+            RowGroupInfo rowGroupInfo = rowGroups.get(currentRowGroup);
+            currentBlockMetadata = rowGroupInfo.prunedBlockMetadata();
+            firstRowIndexInGroup = rowGroupInfo.fileRowOffset();
+            currentGroupRowCount = currentBlockMetadata.getRowCount();
+            FilteredRowRanges currentGroupRowRanges = blockRowRanges[currentRowGroup];
+            log.debug("advanceToNextRowGroup dataSource %s, currentRowGroup %d, rowRanges %s, currentBlockMetadata %s", dataSource.getId(), currentRowGroup, currentGroupRowRanges, currentBlockMetadata);
+            if (currentGroupRowRanges != null) {
+                long rowCount = currentGroupRowRanges.getRowCount();
+                columnIndexRowsFiltered += currentGroupRowCount - rowCount;
+                if (rowCount == 0) {
+                    // Filters on multiple columns with page indexes may yield non-overlapping row ranges and eliminate the entire row group.
+                    // Advance to the next row group to ensure that we don't return a null Page and close the page source before all row groups are processed.
+                    continue;
+                }
+                currentGroupRowCount = rowCount;
+            }
+            nextRowInGroup = 0L;
+            initializeColumnReaders();
+
+            // Skip the row group unless its dictionaries can satisfy the predicate
+            if (!dictionaryPredicateMatch(rowGroupInfo)) {
+                dictionaryFilteredRowGroups++;
+                continue;
+            }
+            return true;
+        }
+        return false;
+    }
+
+    private boolean dictionaryPredicateMatch(RowGroupInfo rowGroupInfo)
+    {
+        // Dictionary filtering applies only when an index predicate is present; avoid
+        // initializing page readers eagerly when it is not
+        if (rowGroupInfo.indexPredicate().isEmpty()) {
+            return true;
+        }
+        for (PrimitiveField field : primitiveFields) {
+            try {
+                initializePageReader(field);
+                if (!columnReaders.get(field.getId()).dictionaryPredicateMatch(rowGroupInfo)) {
+                    return false;
+                }
+            }
+            catch (Exception e) {
+                log.error(e, "Error while matching dictionary predicate for field %s", field);
             }
-            currentGroupRowCount = rowCount;
         }
-        nextRowInGroup = 0L;
-        initializeColumnReaders();
         return true;
     }
 
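Two aspects of the hunk above are worth calling out. First, the tail recursion in `advanceToNextRowGroup` is replaced by a `while` loop with `continue`, so pruning a long run of row groups no longer grows the call stack. Second, the skip decision is delegated to `ColumnReader.dictionaryPredicateMatch`; the minimal, self-contained sketch below approximates that contract with plain `java.util` types (`List` and `Predicate` are stand-ins for the real dictionary page and index predicate, and a production reader must also verify that the column chunk is fully dictionary-encoded before a skip is sound):

    import java.util.List;
    import java.util.function.Predicate;

    final class DictionaryFilterSketch
    {
        private DictionaryFilterSketch() {}

        // Returns true if the row group may contain matching rows and must be read.
        // A skip (false) requires that NO dictionary entry satisfies the predicate.
        static <T> boolean dictionaryPredicateMatch(List<T> dictionaryEntries, Predicate<T> predicate)
        {
            for (T entry : dictionaryEntries) {
                if (predicate.test(entry)) {
                    return true; // one matching encoded value is enough: cannot skip
                }
            }
            return false; // no dictionary value can match: the row group is skippable
        }

        public static void main(String[] args)
        {
            List<Long> dictionary = List.of(10L, 20L, 30L);
            System.out.println(dictionaryPredicateMatch(dictionary, x -> x > 25));  // true
            System.out.println(dictionaryPredicateMatch(dictionary, x -> x > 100)); // false
        }
    }

Note the asymmetry: a single matching entry forces the row group to be read, which is why the PR's `dictionaryPredicateMatch` returns `false` only when some column reader reports a definite non-match.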
@@ -654,29 +685,10 @@ private FilteredOffsetIndex getFilteredOffsetIndex(FilteredRowRanges rowRanges,
     private ColumnChunk readPrimitive(PrimitiveField field)
             throws IOException
     {
-        ColumnDescriptor columnDescriptor = field.getDescriptor();
         int fieldId = field.getId();
         ColumnReader columnReader = columnReaders.get(fieldId);
         if (!columnReader.hasPageReader()) {
-            validateParquet(currentBlockMetadata.getRowCount() > 0, dataSource.getId(), "Row group has 0 rows");
-            ColumnChunkMetadata metadata = currentBlockMetadata.getColumnChunkMetaData(columnDescriptor);
-            FilteredRowRanges rowRanges = blockRowRanges[currentRowGroup];
-            OffsetIndex offsetIndex = null;
-            if (rowRanges != null) {
-                offsetIndex = getFilteredOffsetIndex(rowRanges, currentRowGroup, currentBlockMetadata.getRowCount(), metadata.getPath());
-            }
-            ChunkedInputStream columnChunkInputStream = chunkReaders.get(new ChunkKey(fieldId, currentRowGroup));
-            columnReader.setPageReader(
-                    createPageReader(
-                            dataSource.getId(),
-                            columnChunkInputStream,
-                            metadata,
-                            columnDescriptor,
-                            offsetIndex,
-                            fileCreatedBy,
-                            decryptionContext,
-                            options.getMaxPageReadSize().toBytes()),
-                    Optional.ofNullable(rowRanges));
+            initializePageReader(field);
         }
         ColumnChunk columnChunk = columnReader.readPrimitive();
 
@@ -692,6 +704,34 @@ private ColumnChunk readPrimitive(PrimitiveField field)
         return columnChunk;
     }
 
+    private void initializePageReader(PrimitiveField field)
+            throws ParquetCorruptionException
+    {
+        ColumnDescriptor columnDescriptor = field.getDescriptor();
+        int fieldId = field.getId();
+        ColumnReader columnReader = columnReaders.get(fieldId);
+        checkState(!columnReader.hasPageReader(), "Page reader already initialized");
+        validateParquet(currentBlockMetadata.getRowCount() > 0, dataSource.getId(), "Row group has 0 rows");
+        ColumnChunkMetadata metadata = currentBlockMetadata.getColumnChunkMetaData(columnDescriptor);
+        FilteredRowRanges rowRanges = blockRowRanges[currentRowGroup];
+        OffsetIndex offsetIndex = null;
+        if (rowRanges != null) {
+            offsetIndex = getFilteredOffsetIndex(rowRanges, currentRowGroup, currentBlockMetadata.getRowCount(), metadata.getPath());
+        }
+        ChunkedInputStream columnChunkInputStream = chunkReaders.get(new ChunkKey(fieldId, currentRowGroup));
+        columnReader.setPageReader(
+                createPageReader(
+                        dataSource.getId(),
+                        columnChunkInputStream,
+                        metadata,
+                        columnDescriptor,
+                        offsetIndex,
+                        fileCreatedBy,
+                        decryptionContext,
+                        options.getMaxPageReadSize().toBytes()),
+                Optional.ofNullable(rowRanges));
+    }
+
     public List<Column> getColumnFields()
     {
         return columnFields;
@@ -704,6 +744,7 @@ public Metrics getMetrics()
         if (columnIndexRowsFiltered >= 0) {
             metrics.put(COLUMN_INDEX_ROWS_FILTERED, new LongCount(columnIndexRowsFiltered));
         }
+        metrics.put(PARQUET_READER_DICTIONARY_FILTERED_ROWGROUPS, new LongCount(dictionaryFilteredRowGroups));
         metrics.putAll(dataSource.getMetrics().getMetrics());
 
         return new Metrics(metrics.buildOrThrow());
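For reference, a hedged sketch of how a consumer of `getMetrics()` might read the new counter. It assumes `Metrics#getMetrics()` exposes the metric map and that `LongCount` implements `Count#getTotal()`; verify both signatures against the SPI version in your tree:

    import io.trino.plugin.base.metrics.LongCount;
    import io.trino.spi.metrics.Metric;
    import io.trino.spi.metrics.Metrics;

    final class DictionaryMetricLookup
    {
        private DictionaryMetricLookup() {}

        // Hypothetical consumer-side helper; the metric key is taken from this diff
        static long dictionaryFilteredRowGroups(Metrics metrics)
        {
            Metric<?> metric = metrics.getMetrics().get("ParquetReaderDictionaryFilteredRowGroups");
            return metric instanceof LongCount count ? count.getTotal() : 0L;
        }
    }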