@@ -25,11 +25,11 @@
 import java.lang.reflect.InvocationTargetException;
 import java.lang.reflect.Method;
 import java.net.URI;
-import java.net.URISyntaxException;
 import java.nio.channels.Channels;
 import java.util.*;

 import scala.Option;
+import scala.collection.JavaConverters;
 import scala.collection.Seq;
 import scala.collection.mutable.Buffer;
@@ -52,6 +52,7 @@
 import org.apache.parquet.column.ColumnDescriptor;
 import org.apache.parquet.hadoop.metadata.BlockMetaData;
 import org.apache.parquet.hadoop.metadata.ParquetMetadata;
+import org.apache.parquet.schema.GroupType;
 import org.apache.parquet.schema.MessageType;
 import org.apache.parquet.schema.Type;
 import org.apache.spark.TaskContext;
@@ -61,6 +62,7 @@
 import org.apache.spark.sql.comet.parquet.CometParquetReadSupport;
 import org.apache.spark.sql.comet.util.Utils$;
 import org.apache.spark.sql.execution.datasources.PartitionedFile;
+import org.apache.spark.sql.execution.datasources.parquet.ParquetColumn;
 import org.apache.spark.sql.execution.datasources.parquet.ParquetToSparkSchemaConverter;
 import org.apache.spark.sql.execution.metric.SQLMetric;
 import org.apache.spark.sql.types.DataType;
@@ -76,8 +78,6 @@
 import org.apache.comet.vector.CometVector;
 import org.apache.comet.vector.NativeUtil;

-import static org.apache.comet.parquet.TypeUtil.isEqual;
-
 /**
  * A vectorized Parquet reader that reads a Parquet file in a batched fashion.
  *
@@ -113,6 +113,7 @@ public class NativeBatchReader extends RecordReader<Void, ColumnarBatch> impleme

   private StructType sparkSchema;
   private StructType dataSchema;
+  MessageType fileSchema;
   private MessageType requestedSchema;
   private CometVector[] vectors;
   private AbstractColumnReader[] columnReaders;
@@ -124,6 +125,8 @@ public class NativeBatchReader extends RecordReader<Void, ColumnarBatch> impleme
   private ParquetMetadata footer;
   private byte[] nativeFilter;

+  private ParquetColumn parquetColumn;
+
   /**
    * Whether the native scan should always return decimal represented by 128 bits, regardless of its
    * precision. Normally, this should be true if native execution is enabled, since Arrow compute
@@ -229,7 +232,13 @@ public NativeBatchReader(AbstractColumnReader[] columnReaders) {
    * Initialize this reader. The reason we don't do it in the constructor is that we want to close
    * any resources held by this reader if an error happens during initialization.
    */
-  public void init() throws URISyntaxException, IOException {
+  public void init() throws Throwable {
+
+    conf.set("spark.sql.parquet.binaryAsString", "false");
+    conf.set("spark.sql.parquet.int96AsTimestamp", "false");
+    conf.set("spark.sql.caseSensitive", "false");
+    conf.set("spark.sql.parquet.inferTimestampNTZ.enabled", "true");
+    conf.set("spark.sql.legacy.parquet.nanosAsLong", "false");

     useDecimal128 =
         conf.getBoolean(
@@ -257,10 +266,11 @@ public void init() throws URISyntaxException, IOException {
             CometInputFile.fromPath(path, conf), footer, readOptions, cometReadOptions, metrics)) {

       requestedSchema = footer.getFileMetaData().getSchema();
-      MessageType fileSchema = requestedSchema;
+      fileSchema = requestedSchema;
+      ParquetToSparkSchemaConverter converter = new ParquetToSparkSchemaConverter(conf);

       if (sparkSchema == null) {
-        sparkSchema = new ParquetToSparkSchemaConverter(conf).convert(requestedSchema);
+        sparkSchema = converter.convert(requestedSchema);
       } else {
         requestedSchema =
             CometParquetReadSupport.clipParquetSchema(
@@ -269,9 +279,11 @@ public void init() throws URISyntaxException, IOException {
           throw new IllegalArgumentException(
               String.format(
                   "Spark schema has %d columns while " + "Parquet schema has %d columns",
-                  sparkSchema.size(), requestedSchema.getColumns().size()));
+                  sparkSchema.size(), requestedSchema.getFieldCount()));
         }
       }
+      this.parquetColumn =
+          converter.convertParquetColumn(requestedSchema, Option.apply(this.sparkSchema));

       String timeZoneId = conf.get("spark.sql.session.timeZone");
       // Native code uses "UTC" always as the timeZoneId when converting from spark to arrow schema.
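The five configuration keys pinned at the top of init() are the ones Spark's Configuration-based ParquetToSparkSchemaConverter constructor reads, and convertParquetColumn builds the ParquetColumn tree that the new checkColumn/checkParquetType traversals below walk. A minimal standalone sketch of that interaction, assuming a recent Spark and parquet-mr on the classpath (the class name and schema fields are invented for illustration):

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.Types;
import org.apache.spark.sql.execution.datasources.parquet.ParquetColumn;
import org.apache.spark.sql.execution.datasources.parquet.ParquetToSparkSchemaConverter;
import scala.Option;
import scala.collection.JavaConverters;

import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64;

public class ParquetColumnSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // The Configuration-based converter constructor reads these keys, so a bare Hadoop conf
    // (as used by this reader) must have them set explicitly -- hence the conf.set calls in init().
    conf.set("spark.sql.parquet.binaryAsString", "false");
    conf.set("spark.sql.parquet.int96AsTimestamp", "false");
    conf.set("spark.sql.caseSensitive", "false");
    conf.set("spark.sql.parquet.inferTimestampNTZ.enabled", "true");
    conf.set("spark.sql.legacy.parquet.nanosAsLong", "false");

    // An invented "requested" schema: one primitive column plus one nested group.
    MessageType requestedSchema =
        Types.buildMessage()
            .required(INT64).named("id")
            .optionalGroup()
            .optional(BINARY).named("city")
            .named("address")
            .named("spark_schema");

    ParquetToSparkSchemaConverter converter = new ParquetToSparkSchemaConverter(conf);
    // The reader above passes Option.apply(sparkSchema); with no Spark read schema the converter
    // infers the Spark types from the Parquet types alone.
    ParquetColumn root = converter.convertParquetColumn(requestedSchema, Option.empty());

    // root.children() is index-aligned with the top-level Parquet fields, which is what lets
    // init() pair checkColumn(parquetFields[i]) with requestedSchema.getFields().get(i).
    for (ParquetColumn child : JavaConverters.seqAsJavaList(root.children())) {
      String path = String.join(".", JavaConverters.seqAsJavaList(child.path()));
      System.out.println(path + " -> " + child.sparkType() + ", primitive=" + child.isPrimitive());
    }
  }
}
```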
@@ -283,6 +295,8 @@ public void init() throws URISyntaxException, IOException {
       // Create Column readers
       List<Type> fields = requestedSchema.getFields();
       List<Type> fileFields = fileSchema.getFields();
+      ParquetColumn[] parquetFields =
+          JavaConverters.seqAsJavaList(parquetColumn.children()).toArray(new ParquetColumn[0]);
       int numColumns = fields.size();
       if (partitionSchema != null) numColumns += partitionSchema.size();
       columnReaders = new AbstractColumnReader[numColumns];
@@ -332,9 +346,8 @@ public void init() throws URISyntaxException, IOException {
       } else if (optFileField.isPresent()) {
         // The column we are reading may be a complex type in which case we check if each field in
         // the requested type is in the file type (and the same data type)
-        if (!isEqual(field, optFileField.get())) {
-          throw new UnsupportedOperationException("Schema evolution is not supported");
-        }
+        // This performs the same check as Spark's VectorizedParquetRecordReader
+        checkColumn(parquetFields[i]);
         missingColumns[i] = false;
       } else {
         if (field.getRepetition() == Type.Repetition.REQUIRED) {
@@ -407,6 +420,77 @@ public void init() throws URISyntaxException, IOException {
     isInitialized = true;
   }

+  /**
+   * Checks that a requested column which is present in the file is compatible with the requested
+   * Spark type (via TypeUtil.checkParquetType), and that a column missing from the file is not
+   * required.
+   */
+  private void checkParquetType(ParquetColumn column) throws IOException {
+    String[] path = JavaConverters.seqAsJavaList(column.path()).toArray(new String[0]);
+    if (containsPath(fileSchema, path)) {
+      if (column.isPrimitive()) {
+        ColumnDescriptor desc = column.descriptor().get();
+        ColumnDescriptor fd = fileSchema.getColumnDescription(desc.getPath());
+        TypeUtil.checkParquetType(fd, column.sparkType());
+      } else {
+        for (ParquetColumn childColumn : JavaConverters.seqAsJavaList(column.children())) {
+          checkColumn(childColumn);
+        }
+      }
+    } else { // A missing column which is either primitive or complex
+      if (column.required()) {
+        // Column is missing in data but the required data is non-nullable. This file is invalid.
+        throw new IOException(
+            "Required column is missing in data file. Col: " + Arrays.toString(path));
+      }
+    }
+  }
+
+  /**
+   * From Spark's VectorizedParquetRecordReader: checks whether a column from the requested schema
+   * is missing from the file schema, or whether it conforms to the type of the file schema.
+   */
+  private void checkColumn(ParquetColumn column) throws IOException {
+    String[] path = JavaConverters.seqAsJavaList(column.path()).toArray(new String[0]);
+    if (containsPath(fileSchema, path)) {
+      if (column.isPrimitive()) {
+        ColumnDescriptor desc = column.descriptor().get();
+        ColumnDescriptor fd = fileSchema.getColumnDescription(desc.getPath());
+        if (!fd.equals(desc)) {
+          throw new UnsupportedOperationException("Schema evolution not supported.");
+        }
+      } else {
+        for (ParquetColumn childColumn : JavaConverters.seqAsJavaList(column.children())) {
+          checkColumn(childColumn);
+        }
+      }
+    } else { // A missing column which is either primitive or complex
+      if (column.required()) {
+        // Column is missing in data but the required data is non-nullable. This file is invalid.
+        throw new IOException(
+            "Required column is missing in data file. Col: " + Arrays.toString(path));
+      }
+    }
+  }
+
+  /**
+   * Checks whether the given 'path' exists in 'parquetType'. The difference between this and
+   * {@link MessageType#containsPath(String[])} is that the latter only supports paths to leaf
+   * nodes, while this supports paths to both leaf and non-leaf nodes.
+   */
+  private boolean containsPath(Type parquetType, String[] path) {
+    return containsPath(parquetType, path, 0);
+  }
+
+  private boolean containsPath(Type parquetType, String[] path, int depth) {
+    if (path.length == depth) return true;
+    if (parquetType instanceof GroupType) {
+      String fieldName = path[depth];
+      GroupType parquetGroupType = (GroupType) parquetType;
+      if (parquetGroupType.containsField(fieldName)) {
+        return containsPath(parquetGroupType.getType(fieldName), path, depth + 1);
+      }
+    }
+    return false;
+  }
+
   public void setSparkSchema(StructType schema) {
     this.sparkSchema = schema;
   }
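For reference, a self-contained sketch of the containsPath recursion above (class name and schema are invented): unlike MessageType#containsPath, it also accepts a path that stops at a non-leaf group node.

```java
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.Type;
import org.apache.parquet.schema.Types;

import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64;

public class ContainsPathSketch {
  // Same logic as the private helper in NativeBatchReader: walk one group level per path element.
  static boolean containsPath(Type parquetType, String[] path, int depth) {
    if (path.length == depth) return true;
    if (parquetType instanceof GroupType) {
      GroupType groupType = (GroupType) parquetType;
      String fieldName = path[depth];
      if (groupType.containsField(fieldName)) {
        return containsPath(groupType.getType(fieldName), path, depth + 1);
      }
    }
    return false;
  }

  public static void main(String[] args) {
    // An invented file schema with a nested group.
    MessageType fileSchema =
        Types.buildMessage()
            .required(INT64).named("id")
            .optionalGroup()
            .optional(BINARY).named("city")
            .named("address")
            .named("spark_schema");

    System.out.println(containsPath(fileSchema, new String[] {"address"}, 0)); // true: non-leaf path
    System.out.println(containsPath(fileSchema, new String[] {"address", "city"}, 0)); // true: leaf path
    System.out.println(containsPath(fileSchema, new String[] {"address", "zip"}, 0)); // false: missing field
  }
}
```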
@@ -532,7 +616,10 @@ private int loadNextBatch() throws Throwable {
     if (importer != null) importer.close();
     importer = new CometSchemaImporter(ALLOCATOR);

-    List<ColumnDescriptor> columns = requestedSchema.getColumns();
+    for (ParquetColumn childColumn : JavaConverters.seqAsJavaList(parquetColumn.children())) {
+      checkParquetType(childColumn);
+    }
+
     List<Type> fields = requestedSchema.getFields();
     for (int i = 0; i < fields.size(); i++) {
       if (!missingColumns[i]) {