
Commit e98b553

yuchenhuo authored and Robert Kruszewski committed
[SPARK-23822][SQL] Improve error message for Parquet schema mismatches
## What changes were proposed in this pull request?

This pull request improves the error message Spark emits when reading Parquet files with mismatched schemas, e.g. one file with a STRING column where another has an INT column. A new SchemaColumnConvertNotSupportedException is added to replace the old UnsupportedOperationException. The exception is then caught in FileScanRDD.scala and rethrown as a more general QueryExecutionException that names the Parquet file which triggered the exception.

## How was this patch tested?

Unit tests were added to check the new exception and verify the error messages. Also manually tested with two Parquet files with different schemas to check the error message.

![screen shot 2018-03-30 at 4 03 04 pm](https://user-images.githubusercontent.com/37087310/38156580-dd58a140-3433-11e8-973a-b816d859fbe1.png)

Author: Yuchen Huo <[email protected]>

Closes apache#20953 from yuchenhuo/SPARK-23822.
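A minimal reproduction sketch of the scenario described above, assuming a local `SparkSession`; the path is illustrative and the data mirrors the test added in this patch:

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

// Column "a" is STRING in the first file and INT in the second.
Seq(("bcd", 2)).toDF("a", "b").write.mode("overwrite").parquet("/tmp/mismatch")
Seq((1, "abc")).toDF("a", "b").write.mode("append").parquet("/tmp/mismatch")

// Before this patch the scan failed with a bare UnsupportedOperationException;
// after it, the error names the offending file and column, e.g.
// "Parquet column cannot be converted in file ... Column: [a], ...".
spark.read.parquet("/tmp/mismatch").collect()
```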
1 parent 571ce7c commit e98b553

5 files changed: +166 additions, −13 deletions

sql/core/src/main/java/org/apache/spark/sql/execution/datasources/SchemaColumnConvertNotSupportedException.java

Lines changed: 62 additions & 0 deletions

@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources;
+
+import org.apache.spark.annotation.InterfaceStability;
+
+/**
+ * Exception thrown when the parquet reader finds column type mismatches.
+ */
+@InterfaceStability.Unstable
+public class SchemaColumnConvertNotSupportedException extends RuntimeException {
+
+  /**
+   * Name of the column which cannot be converted.
+   */
+  private String column;
+  /**
+   * Physical column type in the actual parquet file.
+   */
+  private String physicalType;
+  /**
+   * Logical column type in the parquet schema that the parquet reader uses to parse all files.
+   */
+  private String logicalType;
+
+  public String getColumn() {
+    return column;
+  }
+
+  public String getPhysicalType() {
+    return physicalType;
+  }
+
+  public String getLogicalType() {
+    return logicalType;
+  }
+
+  public SchemaColumnConvertNotSupportedException(
+      String column,
+      String physicalType,
+      String logicalType) {
+    super();
+    this.column = column;
+    this.physicalType = physicalType;
+    this.logicalType = logicalType;
+  }
+}
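A sketch of how a caller can consume the new getters; `describeMismatch` is a hypothetical helper, not part of this patch:

```scala
import org.apache.spark.sql.execution.datasources.SchemaColumnConvertNotSupportedException

// Format the mismatch details the exception carries, mirroring the message
// that FileScanRDD builds below.
def describeMismatch(e: SchemaColumnConvertNotSupportedException): String =
  s"Column: ${e.getColumn}, Expected: ${e.getLogicalType}, Found: ${e.getPhysicalType}"
```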

sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java

Lines changed: 27 additions & 11 deletions
@@ -22,6 +22,7 @@
 import static org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase.createRLEIterator;

 import java.io.IOException;
+import java.util.Arrays;
 import java.util.TimeZone;
 import org.apache.parquet.bytes.BytesUtils;
 import org.apache.parquet.column.ColumnDescriptor;
@@ -37,6 +38,7 @@
 import org.apache.parquet.schema.OriginalType;
 import org.apache.parquet.schema.PrimitiveType;
 import org.apache.spark.sql.catalyst.util.DateTimeUtils;
+import org.apache.spark.sql.execution.datasources.SchemaColumnConvertNotSupportedException;
 import org.apache.spark.sql.execution.vectorized.WritableColumnVector;
 import org.apache.spark.sql.types.DataTypes;
 import org.apache.spark.sql.types.DecimalType;
@@ -233,6 +235,18 @@ private boolean shouldConvertTimestamps() {
     return convertTz != null && !convertTz.equals(UTC);
   }

+  /**
+   * Helper function to construct exception for parquet schema mismatch.
+   */
+  private SchemaColumnConvertNotSupportedException constructConvertNotSupportedException(
+      ColumnDescriptor descriptor,
+      WritableColumnVector column) {
+    return new SchemaColumnConvertNotSupportedException(
+      Arrays.toString(descriptor.getPath()),
+      descriptor.getType().toString(),
+      column.dataType().toString());
+  }
+
   /**
    * Reads `num` values into column, decoding the values from `dictionaryIds` and `dictionary`.
    */
@@ -263,7 +277,7 @@ private void decodeDictionaryIds(
           }
         }
       } else {
-        throw new UnsupportedOperationException("Unimplemented type: " + column.dataType());
+        throw constructConvertNotSupportedException(descriptor, column);
       }
       break;

@@ -284,7 +298,7 @@ private void decodeDictionaryIds(
           }
         }
       } else {
-        throw new UnsupportedOperationException("Unimplemented type: " + column.dataType());
+        throw constructConvertNotSupportedException(descriptor, column);
       }
       break;

@@ -323,7 +337,7 @@ private void decodeDictionaryIds(
           }
         }
       } else {
-        throw new UnsupportedOperationException();
+        throw constructConvertNotSupportedException(descriptor, column);
       }
       break;
     case BINARY:
@@ -362,7 +376,7 @@ private void decodeDictionaryIds(
           }
         }
       } else {
-        throw new UnsupportedOperationException();
+        throw constructConvertNotSupportedException(descriptor, column);
       }
       break;

@@ -377,7 +391,9 @@ private void decodeDictionaryIds(
   */

  private void readBooleanBatch(int rowId, int num, WritableColumnVector column) {
-    assert(column.dataType() == DataTypes.BooleanType);
+    if (column.dataType() != DataTypes.BooleanType) {
+      throw constructConvertNotSupportedException(descriptor, column);
+    }
     defColumn.readBooleans(
         num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn);
   }
@@ -396,7 +412,7 @@ private void readIntBatch(int rowId, int num, WritableColumnVector column) {
       defColumn.readShorts(
           num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn);
     } else {
-      throw new UnsupportedOperationException("Unimplemented type: " + column.dataType());
+      throw constructConvertNotSupportedException(descriptor, column);
     }
   }

@@ -416,7 +432,7 @@ private void readLongBatch(int rowId, int num, WritableColumnVector column) {
         }
       }
     } else {
-      throw new UnsupportedOperationException("Unsupported conversion to: " + column.dataType());
+      throw constructConvertNotSupportedException(descriptor, column);
     }
   }

@@ -427,7 +443,7 @@ private void readFloatBatch(int rowId, int num, WritableColumnVector column) {
       defColumn.readFloats(
           num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn);
     } else {
-      throw new UnsupportedOperationException("Unsupported conversion to: " + column.dataType());
+      throw constructConvertNotSupportedException(descriptor, column);
     }
   }

@@ -438,7 +454,7 @@ private void readDoubleBatch(int rowId, int num, WritableColumnVector column) {
       defColumn.readDoubles(
           num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn);
     } else {
-      throw new UnsupportedOperationException("Unimplemented type: " + column.dataType());
+      throw constructConvertNotSupportedException(descriptor, column);
     }
   }

@@ -473,7 +489,7 @@ private void readBinaryBatch(int rowId, int num, WritableColumnVector column) {
         }
       }
     } else {
-      throw new UnsupportedOperationException("Unimplemented type: " + column.dataType());
+      throw constructConvertNotSupportedException(descriptor, column);
     }
   }

@@ -512,7 +528,7 @@ private void readFixedLenByteArrayBatch(
         }
       }
     } else {
-      throw new UnsupportedOperationException("Unimplemented type: " + column.dataType());
+      throw constructConvertNotSupportedException(descriptor, column);
     }
   }

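For illustration, a sketch of the values the helper passes through for the mismatch exercised by the tests below; the literals are assumed examples (`[a]` is `Arrays.toString` of the column path, `BINARY` the Parquet physical type, `IntegerType` the Catalyst type the reader expected):

```scala
import org.apache.spark.sql.execution.datasources.SchemaColumnConvertNotSupportedException

val e = new SchemaColumnConvertNotSupportedException(
  "[a]",         // Arrays.toString(descriptor.getPath())
  "BINARY",      // descriptor.getType().toString()
  "IntegerType") // column.dataType().toString()
assert(e.getPhysicalType == "BINARY" && e.getLogicalType == "IntegerType")
```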
sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecutionException.scala

Lines changed: 2 additions & 1 deletion
@@ -17,4 +17,5 @@

 package org.apache.spark.sql.execution

-class QueryExecutionException(message: String) extends Exception(message)
+class QueryExecutionException(message: String, cause: Throwable = null)
+  extends Exception(message, cause)
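Because `cause` defaults to `null`, existing single-argument call sites keep compiling unchanged; a quick sketch of both forms:

```scala
import org.apache.spark.sql.execution.QueryExecutionException

val withoutCause = new QueryExecutionException("scan failed") // old form still valid
val withCause = new QueryExecutionException(                  // new form chains the root cause
  "scan failed", new RuntimeException("root cause"))
assert(withCause.getCause.getMessage == "root cause")
```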

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala

Lines changed: 20 additions & 1 deletion
@@ -21,11 +21,14 @@ import java.io.{FileNotFoundException, IOException}

 import scala.collection.mutable

+import org.apache.parquet.io.ParquetDecodingException
+
 import org.apache.spark.{Partition => RDDPartition, TaskContext, TaskKilledException}
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.rdd.{InputFileBlockHolder, RDD}
 import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.execution.QueryExecutionException
 import org.apache.spark.sql.vectorized.ColumnarBatch
 import org.apache.spark.util.NextIterator
@@ -179,7 +182,23 @@ class FileScanRDD(
           currentIterator = readCurrentFile()
         }

-        hasNext
+        try {
+          hasNext
+        } catch {
+          case e: SchemaColumnConvertNotSupportedException =>
+            val message = "Parquet column cannot be converted in " +
+              s"file ${currentFile.filePath}. Column: ${e.getColumn}, " +
+              s"Expected: ${e.getLogicalType}, Found: ${e.getPhysicalType}"
+            throw new QueryExecutionException(message, e)
+          case e: ParquetDecodingException =>
+            if (e.getMessage.contains("Can not read value at")) {
+              val message = "Encounter error while reading parquet files. " +
+                "One possible cause: Parquet column cannot be converted in the " +
+                "corresponding files. Details: "
+              throw new QueryExecutionException(message, e)
+            }
+            throw e
+        }
       } else {
         currentFile = null
         InputFileBlockHolder.unset()
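How the wrapped error surfaces to a caller, sketched under an assumed mixed-schema directory and an in-scope `spark` session; the path is illustrative:

```scala
import org.apache.spark.SparkException

try {
  spark.read.parquet("/tmp/mismatch").collect()
} catch {
  case e: SparkException =>
    // e.getCause is the QueryExecutionException thrown above, naming the
    // Parquet file and column that could not be converted.
    println(e.getCause.getMessage)
}
```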

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala

Lines changed: 55 additions & 0 deletions
@@ -20,10 +20,13 @@ package org.apache.spark.sql.execution.datasources.parquet
 import scala.reflect.ClassTag
 import scala.reflect.runtime.universe.TypeTag

+import org.apache.parquet.io.ParquetDecodingException
 import org.apache.parquet.schema.{MessageType, MessageTypeParser}

 import org.apache.spark.SparkException
 import org.apache.spark.sql.catalyst.ScalaReflection
+import org.apache.spark.sql.execution.QueryExecutionException
+import org.apache.spark.sql.execution.datasources.SchemaColumnConvertNotSupportedException
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SharedSQLContext
 import org.apache.spark.sql.types._
@@ -382,6 +385,58 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
     }
   }

+  // =======================================
+  // Tests for parquet schema mismatch error
+  // =======================================
+  def testSchemaMismatch(path: String, vectorizedReaderEnabled: Boolean): SparkException = {
+    import testImplicits._
+
+    var e: SparkException = null
+    withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> vectorizedReaderEnabled.toString) {
+      // Create two parquet files with different schemas in the same folder
+      Seq(("bcd", 2)).toDF("a", "b").coalesce(1).write.mode("overwrite").parquet(s"$path/parquet")
+      Seq((1, "abc")).toDF("a", "b").coalesce(1).write.mode("append").parquet(s"$path/parquet")
+
+      e = intercept[SparkException] {
+        spark.read.parquet(s"$path/parquet").collect()
+      }
+    }
+    e
+  }
+
+  test("schema mismatch failure error message for parquet reader") {
+    withTempPath { dir =>
+      val e = testSchemaMismatch(dir.getCanonicalPath, vectorizedReaderEnabled = false)
+      val expectedMessage = "Encounter error while reading parquet files. " +
+        "One possible cause: Parquet column cannot be converted in the corresponding " +
+        "files. Details:"
+      assert(e.getCause.isInstanceOf[QueryExecutionException])
+      assert(e.getCause.getCause.isInstanceOf[ParquetDecodingException])
+      assert(e.getCause.getMessage.startsWith(expectedMessage))
+    }
+  }
+
+  test("schema mismatch failure error message for parquet vectorized reader") {
+    withTempPath { dir =>
+      val e = testSchemaMismatch(dir.getCanonicalPath, vectorizedReaderEnabled = true)
+      assert(e.getCause.isInstanceOf[QueryExecutionException])
+      assert(e.getCause.getCause.isInstanceOf[SchemaColumnConvertNotSupportedException])
+
+      // Check if the physical type is reporting correctly
+      val errMsg = e.getCause.getMessage
+      assert(errMsg.startsWith("Parquet column cannot be converted in file"))
+      val file = errMsg.substring("Parquet column cannot be converted in file ".length,
+        errMsg.indexOf(". "))
+      val col = spark.read.parquet(file).schema.fields.filter(_.name.equals("a"))
+      assert(col.length == 1)
+      if (col(0).dataType == StringType) {
+        assert(errMsg.contains("Column: [a], Expected: IntegerType, Found: BINARY"))
+      } else {
+        assert(errMsg.endsWith("Column: [a], Expected: StringType, Found: INT32"))
+      }
+    }
+  }
+
   // =======================================================
   // Tests for converting Parquet LIST to Catalyst ArrayType
   // =======================================================
