Skip to content
This repository was archived by the owner on Jan 9, 2020. It is now read-only.

Commit 9d29808

Browse files
maropu authored and gatorsmile committed
[SPARK-21144][SQL] Print a warning if the data schema and partition schema have the duplicate columns
## What changes were proposed in this pull request? The current master outputs unexpected results when the data schema and partition schema have the duplicate columns: ``` withTempPath { dir => val basePath = dir.getCanonicalPath spark.range(0, 3).toDF("foo").write.parquet(new Path(basePath, "foo=1").toString) spark.range(0, 3).toDF("foo").write.parquet(new Path(basePath, "foo=a").toString) spark.read.parquet(basePath).show() } +---+ |foo| +---+ | 1| | 1| | a| | a| | 1| | a| +---+ ``` This patch added code to print a warning when the duplication found. ## How was this patch tested? Manually checked. Author: Takeshi Yamamuro <[email protected]> Closes apache#18375 from maropu/SPARK-21144-3. (cherry picked from commit f3dea60) Signed-off-by: gatorsmile <[email protected]>
1 parent b6749ba commit 9d29808

File tree

2 files changed

+59
-0
lines changed

2 files changed

+59
-0
lines changed
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.spark.sql.util

import java.util.Locale

import org.apache.spark.internal.Logging
22+
23+
/**
24+
* Utils for handling schemas.
25+
*
26+
* TODO: Merge this file with [[org.apache.spark.ml.util.SchemaUtils]].
27+
*/
28+
private[spark] object SchemaUtils extends Logging {
29+
30+
/**
31+
* Checks if input column names have duplicate identifiers. Prints a warning message if
32+
* the duplication exists.
33+
*
34+
* @param columnNames column names to check
35+
* @param colType column type name, used in a warning message
36+
* @param caseSensitiveAnalysis whether duplication checks should be case sensitive or not
37+
*/
38+
def checkColumnNameDuplication(
39+
columnNames: Seq[String], colType: String, caseSensitiveAnalysis: Boolean): Unit = {
40+
val names = if (caseSensitiveAnalysis) {
41+
columnNames
42+
} else {
43+
columnNames.map(_.toLowerCase)
44+
}
45+
if (names.distinct.length != names.length) {
46+
val duplicateColumns = names.groupBy(identity).collect {
47+
case (x, ys) if ys.length > 1 => s"`$x`"
48+
}
49+
logWarning(s"Found duplicate column(s) $colType: ${duplicateColumns.mkString(", ")}. " +
50+
"You might need to assign different column names.")
51+
}
52+
}
53+
}

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ import org.apache.spark.sql.execution.streaming._
3939
import org.apache.spark.sql.sources._
4040
import org.apache.spark.sql.streaming.OutputMode
4141
import org.apache.spark.sql.types.{CalendarIntervalType, StructType}
42+
import org.apache.spark.sql.util.SchemaUtils
4243
import org.apache.spark.util.Utils
4344

4445
/**
@@ -181,6 +182,11 @@ case class DataSource(
181182
throw new AnalysisException(
182183
s"Unable to infer schema for $format. It must be specified manually.")
183184
}
185+
186+
SchemaUtils.checkColumnNameDuplication(
187+
(dataSchema ++ partitionSchema).map(_.name), "in the data schema and the partition schema",
188+
sparkSession.sessionState.conf.caseSensitiveAnalysis)
189+
184190
(dataSchema, partitionSchema)
185191
}
186192

0 commit comments

Comments
 (0)