|
18 | 18 | package org.apache.spark.sql.execution.datasources.parquet;
|
19 | 19 |
|
20 | 20 | import java.io.IOException;
|
| 21 | +import java.util.Optional; |
21 | 22 |
|
22 | 23 | import org.apache.hadoop.conf.Configuration;
|
23 |
| -import org.apache.hadoop.fs.FileStatus; |
24 |
| -import org.apache.hadoop.fs.Path; |
25 | 24 | import org.apache.parquet.HadoopReadOptions;
|
26 | 25 | import org.apache.parquet.ParquetReadOptions;
|
27 | 26 | import org.apache.parquet.format.converter.ParquetMetadataConverter;
|
|
37 | 36 | */
|
38 | 37 | public class ParquetFooterReader {
|
39 | 38 |
|
40 |
| - public static final boolean SKIP_ROW_GROUPS = true; |
41 |
| - public static final boolean WITH_ROW_GROUPS = false; |
42 |
| - |
43 | 39 | /**
|
44 |
| - * Reads footer for the input Parquet file 'split'. If 'skipRowGroup' is true, |
45 |
| - * this will skip reading the Parquet row group metadata. |
| 40 | + * Builds the metadata filter used for reading the footer of the input Parquet file 'split'.
| 41 | + * If 'skipRowGroup' is true, this will skip reading the Parquet row group metadata. |
46 | 42 | *
|
47 | 43 | * @param file a part (i.e. "block") of a single file that should be read
|
48 |
| - * @param configuration hadoop configuration of file |
| 44 | + * @param hadoopConf hadoop configuration of file |
49 | 45 | * @param skipRowGroup If true, skip reading row groups;
|
50 | 46 | * if false, read row groups according to the file split range
|
51 | 47 | */
|
52 |
| - public static ParquetMetadata readFooter( |
53 |
| - Configuration configuration, |
54 |
| - PartitionedFile file, |
55 |
| - boolean skipRowGroup) throws IOException { |
56 |
| - long fileStart = file.start(); |
57 |
| - ParquetMetadataConverter.MetadataFilter filter; |
| 48 | + public static ParquetMetadataConverter.MetadataFilter buildFilter( |
| 49 | + Configuration hadoopConf, PartitionedFile file, boolean skipRowGroup) { |
58 | 50 | if (skipRowGroup) {
|
59 |
| - filter = ParquetMetadataConverter.SKIP_ROW_GROUPS; |
| 51 | + return ParquetMetadataConverter.SKIP_ROW_GROUPS; |
60 | 52 | } else {
|
61 |
| - filter = HadoopReadOptions.builder(configuration, file.toPath()) |
| 53 | + long fileStart = file.start(); |
| 54 | + return HadoopReadOptions.builder(hadoopConf, file.toPath()) |
62 | 55 | .withRange(fileStart, fileStart + file.length())
|
63 | 56 | .build()
|
64 | 57 | .getMetadataFilter();
|
65 | 58 | }
|
66 |
| - return readFooter(configuration, file.toPath(), filter); |
67 |
| - } |
68 |
| - |
69 |
| - public static ParquetMetadata readFooter(Configuration configuration, |
70 |
| - Path file, ParquetMetadataConverter.MetadataFilter filter) throws IOException { |
71 |
| - return readFooter(HadoopInputFile.fromPath(file, configuration), filter); |
72 |
| - } |
73 |
| - |
74 |
| - public static ParquetMetadata readFooter(Configuration configuration, |
75 |
| - FileStatus fileStatus, ParquetMetadataConverter.MetadataFilter filter) throws IOException { |
76 |
| - return readFooter(HadoopInputFile.fromStatus(fileStatus, configuration), filter); |
77 | 59 | }
|
78 | 60 |
|
79 |
| - private static ParquetMetadata readFooter(HadoopInputFile inputFile, |
| 61 | + public static ParquetMetadata readFooter( |
| 62 | + HadoopInputFile inputFile, |
80 | 63 | ParquetMetadataConverter.MetadataFilter filter) throws IOException {
|
81 |
| - ParquetReadOptions readOptions = |
82 |
| - HadoopReadOptions.builder(inputFile.getConfiguration(), inputFile.getPath()) |
| 64 | + ParquetReadOptions readOptions = HadoopReadOptions |
| 65 | + .builder(inputFile.getConfiguration(), inputFile.getPath()) |
83 | 66 | .withMetadataFilter(filter).build();
|
84 |
| - // Use try-with-resources to ensure fd is closed. |
85 |
| - try (ParquetFileReader fileReader = ParquetFileReader.open(inputFile, readOptions)) { |
| 67 | + try (var fileReader = ParquetFileReader.open(inputFile, readOptions)) { |
86 | 68 | return fileReader.getFooter();
|
87 | 69 | }
|
88 | 70 | }
|
| 71 | + |
| 72 | + /** |
| 73 | + * Decoding Parquet files generally involves two steps: |
| 74 | + * 1. read and resolve the metadata (footer), |
| 75 | + * 2. read and decode the row groups/column chunks. |
| 76 | + * <p> |
| 77 | + * It's possible to avoid opening the file twice by reusing the SeekableInputStream.
| 78 | + * When keepInputStreamOpen is true, the caller takes responsibility to close the |
| 79 | + * SeekableInputStream. Currently, this is only supported by parquet vectorized reader. |
| 80 | + * |
| 81 | + * @param hadoopConf hadoop configuration of file |
| 82 | + * @param file a part (i.e. "block") of a single file that should be read |
| 83 | + * @param keepInputStreamOpen when true, keep the file's SeekableInputStream open
| 84 | + * @return if keepInputStreamOpen is true, the returned OpenedParquetFooter carries
| 85 | + * an Optional containing the SeekableInputStream, otherwise an empty Optional.
| 86 | + */ |
| 87 | + public static OpenedParquetFooter openFileAndReadFooter( |
| 88 | + Configuration hadoopConf, |
| 89 | + PartitionedFile file, |
| 90 | + boolean keepInputStreamOpen) throws IOException { |
| 91 | + var readOptions = HadoopReadOptions.builder(hadoopConf, file.toPath()) |
| 92 | + // `keepInputStreamOpen` is true only when the parquet vectorized reader is used
| 93 | + // on the caller side. In that case the footer will be reused later when
| 94 | + // reading row groups, so the row group metadata must be read here, up front.
| 95 | + // When false, the caller uses parquet-mr to read the file: only file metadata
| 96 | + // is required in the planning phase, and parquet-mr will read the footer again
| 97 | + // when reading row groups.
| 98 | + .withMetadataFilter(buildFilter(hadoopConf, file, !keepInputStreamOpen)) |
| 99 | + .build(); |
| 100 | + var inputFile = HadoopInputFile.fromPath(file.toPath(), hadoopConf); |
| 101 | + var inputStream = inputFile.newStream(); |
| 102 | + try (var fileReader = ParquetFileReader.open(inputFile, readOptions, inputStream)) { |
| 103 | + var footer = fileReader.getFooter(); |
| 104 | + if (keepInputStreamOpen) { |
| 105 | + fileReader.detachFileInputStream(); |
| 106 | + return new OpenedParquetFooter(footer, inputFile, Optional.of(inputStream)); |
| 107 | + } else { |
| 108 | + return new OpenedParquetFooter(footer, inputFile, Optional.empty()); |
| 109 | + } |
| 110 | + } |
| 111 | + } |
89 | 112 | }
|
0 commit comments