11package com .exasol .parquetio .reader ;
22
3+ import java .io .IOException ;
4+ import java .io .UncheckedIOException ;
5+ import java .util .Collections ;
6+ import java .util .List ;
7+ import java .util .function .Consumer ;
8+
39import com .exasol .errorreporting .ExaError ;
410import com .exasol .parquetio .data .ChunkInterval ;
511import com .exasol .parquetio .data .ChunkIntervalImpl ;
612import com .exasol .parquetio .data .Row ;
13+ import com .exasol .parquetio .merger .ChunkIntervalMerger ;
14+
715import org .apache .hadoop .conf .Configuration ;
816import org .apache .parquet .column .page .PageReadStore ;
917import org .apache .parquet .filter2 .compat .FilterCompat ;
1018import org .apache .parquet .hadoop .ParquetFileReader ;
1119import org .apache .parquet .hadoop .util .HadoopInputFile ;
12- import org .apache .parquet .io .*;
20+ import org .apache .parquet .io .ColumnIOFactory ;
21+ import org .apache .parquet .io .InputFile ;
22+ import org .apache .parquet .io .MessageColumnIO ;
23+ import org .apache .parquet .io .ParquetDecodingException ;
24+ import org .apache .parquet .io .RecordReader ;
1325import org .apache .parquet .io .api .RecordMaterializer ;
1426
15- import java .io .IOException ;
16- import java .io .UncheckedIOException ;
17- import java .util .Collections ;
18- import java .util .List ;
19- import java .util .function .Consumer ;
20-
2127/**
2228 * A Parquet file reader that reads only provided row groups.
2329 */
@@ -35,7 +41,7 @@ public class RowParquetChunkReader {
3541 *
3642 * Since no chunks are provided, it reads all row groups of the given file.
3743 *
38- * @param file a Parquet file
44+ * @param file a Parquet file
3945 */
4046 public RowParquetChunkReader (final InputFile file ) {
4147 this (file , List .of (new ChunkIntervalImpl (0L , getRowGroupSize (file ))));
@@ -60,15 +66,20 @@ public RowParquetChunkReader(final InputFile file, final long start, final long
6066 */
6167 public RowParquetChunkReader (final InputFile file , final List <ChunkInterval > chunks ) {
6268 this .file = file ;
63- this .chunks = chunks ;
69+ if (chunks == null || chunks .isEmpty ()) {
70+ throw new IllegalArgumentException (
71+ ExaError .messageBuilder ("E-PIOJ-5" ).message ("Chunk intervals list is empty." )
72+ .mitigation ("Please provide a valid list of Parquet file chunks." ).toString ());
73+ }
74+ this .chunks = new ChunkIntervalMerger ().sortAndMerge (chunks );
6475 final var readSupport = new RowReadSupport ();
6576 try (final var reader = ParquetFileReader .open (file )) {
6677 final var conf = getConfiguration (file );
6778 final var schema = reader .getFooter ().getFileMetaData ().getSchema ();
6879 final var readContext = readSupport .init (conf , Collections .emptyMap (), schema );
6980 this .recordMaterializer = readSupport .prepareForRead (conf , Collections .emptyMap (), schema , readContext );
7081 this .messageIO = new ColumnIOFactory (reader .getFooter ().getFileMetaData ().getCreatedBy ())//
71- .getColumnIO (readContext .getRequestedSchema (), schema , true );
82+ .getColumnIO (readContext .getRequestedSchema (), schema , true );
7283 } catch (IOException exception ) {
7384 throw new UncheckedIOException (getFileReadingErrorMessage (file ), exception );
7485 } catch (RuntimeException exception ) {
@@ -90,12 +101,9 @@ private static long getRowGroupSize(final InputFile file) {
90101 } catch (IOException exception ) {
91102 throw new UncheckedIOException (getFileReadingErrorMessage (file ), exception );
92103 } catch (RuntimeException exception ) {
93- throw new IllegalStateException (ExaError
94- .messageBuilder ("E-PIOJ-3" )
95- .message ("Error getting row group size from a Parquet {{FILE}} file." , file .toString ())
96- .mitigation (CHECK_FILE_MITIGATION ).toString (),
97- exception
98- );
104+ throw new IllegalStateException (ExaError .messageBuilder ("E-PIOJ-3" )
105+ .message ("Error getting row group size from a Parquet {{FILE}} file." , file .toString ())
106+ .mitigation (CHECK_FILE_MITIGATION ).toString (), exception );
99107 }
100108 }
101109
@@ -122,7 +130,8 @@ public void read(final Consumer<Row> rowConsumer) {
122130 }
123131 }
124132
125- private long moveToRowGroupPosition (final ParquetFileReader reader , final long currentPosition , final long startPosition ) {
133+ private long moveToRowGroupPosition (final ParquetFileReader reader , final long currentPosition ,
134+ final long startPosition ) {
126135 long position = currentPosition ;
127136 while (position < startPosition ) {
128137 reader .skipNextRowGroup ();
@@ -132,27 +141,25 @@ private long moveToRowGroupPosition(final ParquetFileReader reader, final long c
132141 }
133142
134143 private void consumeRows (final PageReadStore pageStore , final Consumer <Row > rowConsumer ) {
135- final RecordReader <Row > recordReader = messageIO .getRecordReader (pageStore , recordMaterializer , FilterCompat .NOOP );
144+ final RecordReader <Row > recordReader = messageIO .getRecordReader (pageStore , recordMaterializer ,
145+ FilterCompat .NOOP );
136146 consumeRecords (recordReader , pageStore .getRowCount (), rowConsumer );
137147 }
138148
139149 // This is similar to how Parquet reads records underneath,
140150 // https://github.com/apache/parquet-mr/blob/master/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/InternalParquetRecordReader.java#L217
141- protected void consumeRecords (final RecordReader <Row > recordReader , final long totalRows , final Consumer <Row > rowConsumer ) {
151+ protected void consumeRecords (final RecordReader <Row > recordReader , final long totalRows ,
152+ final Consumer <Row > rowConsumer ) {
142153 long currentRow = 0 ;
143154 Row row ;
144155 while (currentRow < totalRows ) {
145156 currentRow += 1 ;
146157 try {
147158 row = recordReader .read ();
148159 } catch (RecordMaterializer .RecordMaterializationException exception ) {
149- throw new ParquetDecodingException (ExaError
150- .messageBuilder ("F-PIOJ-2" )
151- .message ("Failed to materialize a record from the Parquet file {{FILE}}." , this .file .toString ())
152- .mitigation (CHECK_FILE_MITIGATION )
153- .toString (),
154- exception
155- );
160+ throw new ParquetDecodingException (ExaError .messageBuilder ("F-PIOJ-2" )
161+ .message ("Failed to materialize a record from the Parquet file {{FILE}}." , this .file .toString ())
162+ .mitigation (CHECK_FILE_MITIGATION ).toString (), exception );
156163 }
157164 if (row == null ) { // Only happens with FilteredRecordReader at end of block
158165 break ;
@@ -164,11 +171,8 @@ protected void consumeRecords(final RecordReader<Row> recordReader, final long t
164171 }
165172
166173 private static String getFileReadingErrorMessage (final InputFile file ) {
167- return ExaError
168- .messageBuilder ("E-PIOJ-1" )
169- .message ("Failed to read Parquet file {{FILE}}." , file .toString ())
170- .mitigation (CHECK_FILE_MITIGATION )
171- .toString ();
174+ return ExaError .messageBuilder ("E-PIOJ-1" ).message ("Failed to read Parquet file {{FILE}}." , file .toString ())
175+ .mitigation (CHECK_FILE_MITIGATION ).toString ();
172176 }
173177
174178}
0 commit comments