Commit fa09d91

gengliangwang authored and gatorsmile committed
[SPARK-24919][BUILD] New linter rule for sparkContext.hadoopConfiguration
## What changes were proposed in this pull request?

In most cases, we should use `spark.sessionState.newHadoopConf()` instead of `sparkContext.hadoopConfiguration`, so that the Hadoop configurations specified in the Spark session configuration take effect. This change adds a linter rule matching `spark.sparkContext.hadoopConfiguration` or `spark.sqlContext.sparkContext.hadoopConfiguration` to prevent that usage.

## How was this patch tested?

Unit test.

Author: Gengliang Wang <[email protected]>

Closes apache#21873 from gengliangwang/linterRule.
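For illustration only (not part of this patch), a minimal sketch of the two access paths, assuming a `SparkSession` named `spark` is in scope; the variable names are made up:

```scala
// Discouraged: the single, JVM-wide Hadoop configuration. Reading it bypasses
// per-session overrides, and mutating it leaks state into every other session and test.
val globalHadoopConf = spark.sparkContext.hadoopConfiguration

// Preferred: a fresh Configuration derived from the global one that also reflects
// Hadoop settings made on this session, e.g. via spark.conf.set(...).
val sessionHadoopConf = spark.sessionState.newHadoopConf()
```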
1 parent 2c82745 commit fa09d91

8 files changed: +45 lines, -37 lines


external/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala

Lines changed: 6 additions & 20 deletions
@@ -638,12 +638,8 @@ class AvroSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
     intercept[FileNotFoundException] {
       withTempPath { dir =>
         FileUtils.touch(new File(dir, "test"))
-        val hadoopConf = spark.sqlContext.sparkContext.hadoopConfiguration
-        try {
-          hadoopConf.set(AvroFileFormat.IgnoreFilesWithoutExtensionProperty, "true")
+        withSQLConf(AvroFileFormat.IgnoreFilesWithoutExtensionProperty -> "true") {
           spark.read.format("avro").load(dir.toString)
-        } finally {
-          hadoopConf.unset(AvroFileFormat.IgnoreFilesWithoutExtensionProperty)
         }
       }
     }
@@ -717,15 +713,10 @@ class AvroSuite extends QueryTest with SharedSQLContext with SQLTestUtils {

       Files.createFile(new File(tempSaveDir, "non-avro").toPath)

-      val hadoopConf = spark.sqlContext.sparkContext.hadoopConfiguration
-      val count = try {
-        hadoopConf.set(AvroFileFormat.IgnoreFilesWithoutExtensionProperty, "true")
+      withSQLConf(AvroFileFormat.IgnoreFilesWithoutExtensionProperty -> "true") {
         val newDf = spark.read.format("avro").load(tempSaveDir)
-        newDf.count()
-      } finally {
-        hadoopConf.unset(AvroFileFormat.IgnoreFilesWithoutExtensionProperty)
+        assert(newDf.count() == 8)
       }
-      assert(count == 8)
     }
   }

@@ -888,20 +879,15 @@ class AvroSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
         Paths.get(new URL(episodesAvro).toURI),
         Paths.get(dir.getCanonicalPath, "episodes"))

-      val hadoopConf = spark.sqlContext.sparkContext.hadoopConfiguration
-      val count = try {
-        hadoopConf.set(AvroFileFormat.IgnoreFilesWithoutExtensionProperty, "true")
+      val hadoopConf = spark.sessionState.newHadoopConf()
+      withSQLConf(AvroFileFormat.IgnoreFilesWithoutExtensionProperty -> "true") {
         val newDf = spark
           .read
           .option("ignoreExtension", "true")
           .format("avro")
           .load(s"${dir.getCanonicalPath}/episodes")
-        newDf.count()
-      } finally {
-        hadoopConf.unset(AvroFileFormat.IgnoreFilesWithoutExtensionProperty)
+        assert(newDf.count() == 8)
       }
-
-      assert(count == 8)
     }
   }
 }
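The Avro tests above replace the manual set / try / finally / unset dance on the shared Hadoop configuration with the `withSQLConf` test helper. As a simplified sketch of the pattern only (the real helper lives in Spark's `SQLTestUtils`/`SQLHelper` test traits), it remembers the old values, applies the overrides on the session configuration, runs the body, and restores the previous state:

```scala
// Simplified sketch of the withSQLConf pattern, assuming a SparkSession named `spark`.
def withSQLConf(pairs: (String, String)*)(body: => Unit): Unit = {
  val conf = spark.conf
  // Remember the previous value (if any) of every key we are about to override.
  val previous = pairs.map { case (key, _) => key -> conf.getOption(key) }
  pairs.foreach { case (key, value) => conf.set(key, value) }
  try body finally {
    // Restore the session configuration whether or not the body threw.
    previous.foreach {
      case (key, Some(value)) => conf.set(key, value)
      case (key, None)        => conf.unset(key)
    }
  }
}
```

Because these overrides land in the session configuration, code that obtains its Hadoop configuration through `spark.sessionState.newHadoopConf()` still sees them, which is the behaviour the commit message describes.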

mllib/src/main/scala/org/apache/spark/ml/image/HadoopUtils.scala

Lines changed: 4 additions & 0 deletions
@@ -38,7 +38,9 @@ private object RecursiveFlag {
    */
   def withRecursiveFlag[T](value: Boolean, spark: SparkSession)(f: => T): T = {
     val flagName = FileInputFormat.INPUT_DIR_RECURSIVE
+    // scalastyle:off hadoopconfiguration
     val hadoopConf = spark.sparkContext.hadoopConfiguration
+    // scalastyle:on hadoopconfiguration
     val old = Option(hadoopConf.get(flagName))
     hadoopConf.set(flagName, value.toString)
     try f finally {
@@ -98,7 +100,9 @@ private object SamplePathFilter {
     val sampleImages = sampleRatio < 1
     if (sampleImages) {
       val flagName = FileInputFormat.PATHFILTER_CLASS
+      // scalastyle:off hadoopconfiguration
       val hadoopConf = spark.sparkContext.hadoopConfiguration
+      // scalastyle:on hadoopconfiguration
       val old = Option(hadoopConf.getClass(flagName, null))
       hadoopConf.setDouble(SamplePathFilter.ratioParam, sampleRatio)
       hadoopConf.setLong(SamplePathFilter.seedParam, seed)

mllib/src/test/scala/org/apache/spark/ml/clustering/LDASuite.scala

Lines changed: 1 addition & 1 deletion
@@ -285,7 +285,7 @@ class LDASuite extends MLTest with DefaultReadWriteTest {
     // There should be 1 checkpoint remaining.
     assert(model.getCheckpointFiles.length === 1)
     val checkpointFile = new Path(model.getCheckpointFiles.head)
-    val fs = checkpointFile.getFileSystem(spark.sparkContext.hadoopConfiguration)
+    val fs = checkpointFile.getFileSystem(spark.sessionState.newHadoopConf())
     assert(fs.exists(checkpointFile))
     model.deleteCheckpointFiles()
     assert(model.getCheckpointFiles.isEmpty)

scalastyle-config.xml

Lines changed: 13 additions & 0 deletions
@@ -150,6 +150,19 @@ This file is divided into 3 sections:
 // scalastyle:on println]]></customMessage>
   </check>

+  <check customId="hadoopconfiguration" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
+    <parameters><parameter name="regex">spark(.sqlContext)?.sparkContext.hadoopConfiguration</parameter></parameters>
+    <customMessage><![CDATA[
+      Are you sure that you want to use sparkContext.hadoopConfiguration? In most cases, you should use
+      spark.sessionState.newHadoopConf() instead, so that the hadoop configurations specified in Spark session
+      configuration will come into effect.
+      If you must use sparkContext.hadoopConfiguration, wrap the code block with
+      // scalastyle:off hadoopconfiguration
+      spark.sparkContext.hadoopConfiguration...
+      // scalastyle:on hadoopconfiguration
+    ]]></customMessage>
+  </check>
+
   <check customId="visiblefortesting" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
     <parameters><parameter name="regex">@VisibleForTesting</parameter></parameters>
     <customMessage><![CDATA[
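To make the new check concrete, a small illustrative sketch (not from the patch) of what the regex above flags and what the escape hatch looks like; the variable names are made up:

```scala
// Flagged by the "hadoopconfiguration" rule, which matches
// spark(.sqlContext)?.sparkContext.hadoopConfiguration:
val flagged1 = spark.sparkContext.hadoopConfiguration
val flagged2 = spark.sqlContext.sparkContext.hadoopConfiguration

// Accepted alternative in most code paths:
val perSession = spark.sessionState.newHadoopConf()

// Deliberate use of the shared configuration, suppressed as in HadoopUtils.scala and HiveQuerySuite.scala:
// scalastyle:off hadoopconfiguration
val shared = spark.sparkContext.hadoopConfiguration
// scalastyle:on hadoopconfiguration
```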

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/HadoopFileLinesReaderSuite.scala

Lines changed: 11 additions & 11 deletions
@@ -38,7 +38,7 @@ class HadoopFileLinesReaderSuite extends SharedSQLContext {

     val lines = ranges.map { case (start, length) =>
       val file = PartitionedFile(InternalRow.empty, path.getCanonicalPath, start, length)
-      val hadoopConf = conf.getOrElse(spark.sparkContext.hadoopConfiguration)
+      val hadoopConf = conf.getOrElse(spark.sessionState.newHadoopConf())
       val reader = new HadoopFileLinesReader(file, delimOpt, hadoopConf)

       reader.map(_.toString)
@@ -111,20 +111,20 @@ class HadoopFileLinesReaderSuite extends SharedSQLContext {
   }

   test("io.file.buffer.size is less than line length") {
-    val conf = spark.sparkContext.hadoopConfiguration
-    conf.set("io.file.buffer.size", "2")
-    withTempPath { path =>
-      val lines = getLines(path, text = "abcdef\n123456", ranges = Seq((4, 4), (8, 5)))
-      assert(lines == Seq("123456"))
+    withSQLConf("io.file.buffer.size" -> "2") {
+      withTempPath { path =>
+        val lines = getLines(path, text = "abcdef\n123456", ranges = Seq((4, 4), (8, 5)))
+        assert(lines == Seq("123456"))
+      }
     }
   }

   test("line cannot be longer than line.maxlength") {
-    val conf = spark.sparkContext.hadoopConfiguration
-    conf.set("mapreduce.input.linerecordreader.line.maxlength", "5")
-    withTempPath { path =>
-      val lines = getLines(path, text = "abcdef\n1234", ranges = Seq((0, 15)))
-      assert(lines == Seq("1234"))
+    withSQLConf("mapreduce.input.linerecordreader.line.maxlength" -> "5") {
+      withTempPath { path =>
+        val lines = getLines(path, text = "abcdef\n1234", ranges = Seq((0, 15)))
+        assert(lines == Seq("1234"))
+      }
     }
   }

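A note on the two tests above: `io.file.buffer.size` and `mapreduce.input.linerecordreader.line.maxlength` are Hadoop keys, yet they can be set through `withSQLConf` because, as the commit message describes, settings placed on the session configuration are folded into the Hadoop `Configuration` returned by `newHadoopConf()`. A hedged sketch of that behaviour, assuming a `SparkSession` named `spark`:

```scala
// Set a Hadoop key on the session, then observe it in a per-session Hadoop Configuration.
spark.conf.set("io.file.buffer.size", "2")
val hadoopConf = spark.sessionState.newHadoopConf()
assert(hadoopConf.get("io.file.buffer.size") == "2")
```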
sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala

Lines changed: 1 addition & 1 deletion
@@ -783,7 +783,7 @@ class HiveDDLSuite
       val part1 = Map("a" -> "1", "b" -> "5")
       val part2 = Map("a" -> "2", "b" -> "6")
       val root = new Path(catalog.getTableMetadata(tableIdent).location)
-      val fs = root.getFileSystem(spark.sparkContext.hadoopConfiguration)
+      val fs = root.getFileSystem(spark.sessionState.newHadoopConf())
       // valid
       fs.mkdirs(new Path(new Path(root, "a=1"), "b=5"))
       fs.createNewFile(new Path(new Path(root, "a=1/b=5"), "a.csv")) // file

sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala

Lines changed: 8 additions & 3 deletions
@@ -1177,13 +1177,18 @@ class HiveQuerySuite extends HiveComparisonTest with SQLTestUtils with BeforeAnd
       assert(spark.table("with_parts").filter($"p" === 2).collect().head == Row(1, 2))
     }

-    val originalValue = spark.sparkContext.hadoopConfiguration.get(modeConfKey, "nonstrict")
+    // Turn off style check since the following test is to modify hadoop configuration on purpose.
+    // scalastyle:off hadoopconfiguration
+    val hadoopConf = spark.sparkContext.hadoopConfiguration
+    // scalastyle:on hadoopconfiguration
+
+    val originalValue = hadoopConf.get(modeConfKey, "nonstrict")
     try {
-      spark.sparkContext.hadoopConfiguration.set(modeConfKey, "nonstrict")
+      hadoopConf.set(modeConfKey, "nonstrict")
       sql("INSERT OVERWRITE TABLE with_parts partition(p) select 3, 4")
       assert(spark.table("with_parts").filter($"p" === 4).collect().head == Row(3, 4))
     } finally {
-      spark.sparkContext.hadoopConfiguration.set(modeConfKey, originalValue)
+      hadoopConf.set(modeConfKey, originalValue)
     }
   }
 }

sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala

Lines changed: 1 addition & 1 deletion
@@ -2053,7 +2053,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
       val deleteOnExitField = classOf[FileSystem].getDeclaredField("deleteOnExit")
       deleteOnExitField.setAccessible(true)

-      val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
+      val fs = FileSystem.get(spark.sessionState.newHadoopConf())
       val setOfPath = deleteOnExitField.get(fs).asInstanceOf[Set[Path]]

       val testData = sparkContext.parallelize(1 to 10).map(i => TestData(i, i.toString)).toDF()
