Commit fa09d91

gengliangwang authored and gatorsmile committed
[SPARK-24919][BUILD] New linter rule for sparkContext.hadoopConfiguration
## What changes were proposed in this pull request?

In most cases, we should use `spark.sessionState.newHadoopConf()` instead of `sparkContext.hadoopConfiguration`, so that the Hadoop configurations specified in the Spark session configuration take effect. This change adds a linter rule matching `spark.sparkContext.hadoopConfiguration` or `spark.sqlContext.sparkContext.hadoopConfiguration` to prevent that usage.

## How was this patch tested?

Unit test.

Author: Gengliang Wang <[email protected]>

Closes apache#21873 from gengliangwang/linterRule.
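For illustration only (not part of this patch), a minimal sketch of the two access paths, assuming a `SparkSession` named `spark` is in scope; the variable names are made up:

```scala
// Discouraged: the single, JVM-wide Hadoop configuration. Reading it bypasses
// per-session overrides, and mutating it leaks state into every other session and test.
val globalHadoopConf = spark.sparkContext.hadoopConfiguration

// Preferred: a fresh Configuration derived from the global one that also reflects
// Hadoop settings made on this session, e.g. via spark.conf.set(...).
val sessionHadoopConf = spark.sessionState.newHadoopConf()
```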
1 parent 2c82745 commit fa09d91

8 files changed: +45 lines, -37 lines


external/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala

Lines changed: 6 additions & 20 deletions
@@ -638,12 +638,8 @@ class AvroSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
     intercept[FileNotFoundException] {
       withTempPath { dir =>
         FileUtils.touch(new File(dir, "test"))
-        val hadoopConf = spark.sqlContext.sparkContext.hadoopConfiguration
-        try {
-          hadoopConf.set(AvroFileFormat.IgnoreFilesWithoutExtensionProperty, "true")
+        withSQLConf(AvroFileFormat.IgnoreFilesWithoutExtensionProperty -> "true") {
           spark.read.format("avro").load(dir.toString)
-        } finally {
-          hadoopConf.unset(AvroFileFormat.IgnoreFilesWithoutExtensionProperty)
         }
       }
     }
@@ -717,15 +713,10 @@ class AvroSuite extends QueryTest with SharedSQLContext with SQLTestUtils {

       Files.createFile(new File(tempSaveDir, "non-avro").toPath)

-      val hadoopConf = spark.sqlContext.sparkContext.hadoopConfiguration
-      val count = try {
-        hadoopConf.set(AvroFileFormat.IgnoreFilesWithoutExtensionProperty, "true")
+      withSQLConf(AvroFileFormat.IgnoreFilesWithoutExtensionProperty -> "true") {
         val newDf = spark.read.format("avro").load(tempSaveDir)
-        newDf.count()
-      } finally {
-        hadoopConf.unset(AvroFileFormat.IgnoreFilesWithoutExtensionProperty)
+        assert(newDf.count() == 8)
       }
-      assert(count == 8)
     }
   }

@@ -888,20 +879,15 @@ class AvroSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
         Paths.get(new URL(episodesAvro).toURI),
         Paths.get(dir.getCanonicalPath, "episodes"))

-      val hadoopConf = spark.sqlContext.sparkContext.hadoopConfiguration
-      val count = try {
-        hadoopConf.set(AvroFileFormat.IgnoreFilesWithoutExtensionProperty, "true")
+      val hadoopConf = spark.sessionState.newHadoopConf()
+      withSQLConf(AvroFileFormat.IgnoreFilesWithoutExtensionProperty -> "true") {
         val newDf = spark
           .read
           .option("ignoreExtension", "true")
           .format("avro")
           .load(s"${dir.getCanonicalPath}/episodes")
-        newDf.count()
-      } finally {
-        hadoopConf.unset(AvroFileFormat.IgnoreFilesWithoutExtensionProperty)
+        assert(newDf.count() == 8)
       }
-
-      assert(count == 8)
     }
   }
 }
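The Avro tests above replace the manual set / try / finally / unset dance on the shared Hadoop configuration with the `withSQLConf` test helper. As a simplified sketch of the pattern only (the real helper lives in Spark's `SQLTestUtils`/`SQLHelper` test traits), it remembers the old values, applies the overrides on the session configuration, runs the body, and restores the previous state:

```scala
// Simplified sketch of the withSQLConf pattern, assuming a SparkSession named `spark`.
def withSQLConf(pairs: (String, String)*)(body: => Unit): Unit = {
  val conf = spark.conf
  // Remember the previous value (if any) of every key we are about to override.
  val previous = pairs.map { case (key, _) => key -> conf.getOption(key) }
  pairs.foreach { case (key, value) => conf.set(key, value) }
  try body finally {
    // Restore the session configuration whether or not the body threw.
    previous.foreach {
      case (key, Some(value)) => conf.set(key, value)
      case (key, None)        => conf.unset(key)
    }
  }
}
```

Because these overrides land in the session configuration, code that obtains its Hadoop configuration through `spark.sessionState.newHadoopConf()` still sees them, which is the behaviour the commit message describes.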

mllib/src/main/scala/org/apache/spark/ml/image/HadoopUtils.scala

Lines changed: 4 additions & 0 deletions
@@ -38,7 +38,9 @@ private object RecursiveFlag {
    */
   def withRecursiveFlag[T](value: Boolean, spark: SparkSession)(f: => T): T = {
     val flagName = FileInputFormat.INPUT_DIR_RECURSIVE
+    // scalastyle:off hadoopconfiguration
     val hadoopConf = spark.sparkContext.hadoopConfiguration
+    // scalastyle:on hadoopconfiguration
     val old = Option(hadoopConf.get(flagName))
     hadoopConf.set(flagName, value.toString)
     try f finally {
@@ -98,7 +100,9 @@ private object SamplePathFilter {
     val sampleImages = sampleRatio < 1
     if (sampleImages) {
       val flagName = FileInputFormat.PATHFILTER_CLASS
+      // scalastyle:off hadoopconfiguration
       val hadoopConf = spark.sparkContext.hadoopConfiguration
+      // scalastyle:on hadoopconfiguration
       val old = Option(hadoopConf.getClass(flagName, null))
       hadoopConf.setDouble(SamplePathFilter.ratioParam, sampleRatio)
       hadoopConf.setLong(SamplePathFilter.seedParam, seed)

mllib/src/test/scala/org/apache/spark/ml/clustering/LDASuite.scala

Lines changed: 1 addition & 1 deletion
@@ -285,7 +285,7 @@ class LDASuite extends MLTest with DefaultReadWriteTest {
     // There should be 1 checkpoint remaining.
     assert(model.getCheckpointFiles.length === 1)
     val checkpointFile = new Path(model.getCheckpointFiles.head)
-    val fs = checkpointFile.getFileSystem(spark.sparkContext.hadoopConfiguration)
+    val fs = checkpointFile.getFileSystem(spark.sessionState.newHadoopConf())
     assert(fs.exists(checkpointFile))
     model.deleteCheckpointFiles()
     assert(model.getCheckpointFiles.isEmpty)

scalastyle-config.xml

Lines changed: 13 additions & 0 deletions
@@ -150,6 +150,19 @@ This file is divided into 3 sections:
 // scalastyle:on println]]></customMessage>
   </check>

+  <check customId="hadoopconfiguration" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
+    <parameters><parameter name="regex">spark(.sqlContext)?.sparkContext.hadoopConfiguration</parameter></parameters>
+    <customMessage><![CDATA[
+      Are you sure that you want to use sparkContext.hadoopConfiguration? In most cases, you should use
+      spark.sessionState.newHadoopConf() instead, so that the hadoop configurations specified in Spark session
+      configuration will come into effect.
+      If you must use sparkContext.hadoopConfiguration, wrap the code block with
+      // scalastyle:off hadoopconfiguration
+      spark.sparkContext.hadoopConfiguration...
+      // scalastyle:on hadoopconfiguration
+    ]]></customMessage>
+  </check>
+
   <check customId="visiblefortesting" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
     <parameters><parameter name="regex">@VisibleForTesting</parameter></parameters>
     <customMessage><![CDATA[
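To make the new check concrete, a small illustrative sketch (not from the patch) of what the regex above flags and what the escape hatch looks like; the variable names are made up:

```scala
// Flagged by the "hadoopconfiguration" rule, which matches
// spark(.sqlContext)?.sparkContext.hadoopConfiguration:
val flagged1 = spark.sparkContext.hadoopConfiguration
val flagged2 = spark.sqlContext.sparkContext.hadoopConfiguration

// Accepted alternative in most code paths:
val perSession = spark.sessionState.newHadoopConf()

// Deliberate use of the shared configuration, suppressed as in HadoopUtils.scala and HiveQuerySuite.scala:
// scalastyle:off hadoopconfiguration
val shared = spark.sparkContext.hadoopConfiguration
// scalastyle:on hadoopconfiguration
```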

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/HadoopFileLinesReaderSuite.scala

Lines changed: 11 additions & 11 deletions
@@ -38,7 +38,7 @@ class HadoopFileLinesReaderSuite extends SharedSQLContext {

     val lines = ranges.map { case (start, length) =>
       val file = PartitionedFile(InternalRow.empty, path.getCanonicalPath, start, length)
-      val hadoopConf = conf.getOrElse(spark.sparkContext.hadoopConfiguration)
+      val hadoopConf = conf.getOrElse(spark.sessionState.newHadoopConf())
       val reader = new HadoopFileLinesReader(file, delimOpt, hadoopConf)

       reader.map(_.toString)
@@ -111,20 +111,20 @@ class HadoopFileLinesReaderSuite extends SharedSQLContext {
   }

   test("io.file.buffer.size is less than line length") {
-    val conf = spark.sparkContext.hadoopConfiguration
-    conf.set("io.file.buffer.size", "2")
-    withTempPath { path =>
-      val lines = getLines(path, text = "abcdef\n123456", ranges = Seq((4, 4), (8, 5)))
-      assert(lines == Seq("123456"))
+    withSQLConf("io.file.buffer.size" -> "2") {
+      withTempPath { path =>
+        val lines = getLines(path, text = "abcdef\n123456", ranges = Seq((4, 4), (8, 5)))
+        assert(lines == Seq("123456"))
+      }
     }
   }

   test("line cannot be longer than line.maxlength") {
-    val conf = spark.sparkContext.hadoopConfiguration
-    conf.set("mapreduce.input.linerecordreader.line.maxlength", "5")
-    withTempPath { path =>
-      val lines = getLines(path, text = "abcdef\n1234", ranges = Seq((0, 15)))
-      assert(lines == Seq("1234"))
+    withSQLConf("mapreduce.input.linerecordreader.line.maxlength" -> "5") {
+      withTempPath { path =>
+        val lines = getLines(path, text = "abcdef\n1234", ranges = Seq((0, 15)))
+        assert(lines == Seq("1234"))
+      }
     }
   }

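A note on the two tests above: `io.file.buffer.size` and `mapreduce.input.linerecordreader.line.maxlength` are Hadoop keys, yet they can be set through `withSQLConf` because, as the commit message describes, settings placed on the session configuration are folded into the Hadoop `Configuration` returned by `newHadoopConf()`. A hedged sketch of that behaviour, assuming a `SparkSession` named `spark`:

```scala
// Set a Hadoop key on the session, then observe it in a per-session Hadoop Configuration.
spark.conf.set("io.file.buffer.size", "2")
val hadoopConf = spark.sessionState.newHadoopConf()
assert(hadoopConf.get("io.file.buffer.size") == "2")
```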
sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala

Lines changed: 1 addition & 1 deletion
@@ -783,7 +783,7 @@ class HiveDDLSuite
       val part1 = Map("a" -> "1", "b" -> "5")
       val part2 = Map("a" -> "2", "b" -> "6")
       val root = new Path(catalog.getTableMetadata(tableIdent).location)
-      val fs = root.getFileSystem(spark.sparkContext.hadoopConfiguration)
+      val fs = root.getFileSystem(spark.sessionState.newHadoopConf())
       // valid
       fs.mkdirs(new Path(new Path(root, "a=1"), "b=5"))
       fs.createNewFile(new Path(new Path(root, "a=1/b=5"), "a.csv")) // file

sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala

Lines changed: 8 additions & 3 deletions
@@ -1177,13 +1177,18 @@ class HiveQuerySuite extends HiveComparisonTest with SQLTestUtils with BeforeAnd
       assert(spark.table("with_parts").filter($"p" === 2).collect().head == Row(1, 2))
     }

-    val originalValue = spark.sparkContext.hadoopConfiguration.get(modeConfKey, "nonstrict")
+    // Turn off style check since the following test is to modify hadoop configuration on purpose.
+    // scalastyle:off hadoopconfiguration
+    val hadoopConf = spark.sparkContext.hadoopConfiguration
+    // scalastyle:on hadoopconfiguration
+
+    val originalValue = hadoopConf.get(modeConfKey, "nonstrict")
     try {
-      spark.sparkContext.hadoopConfiguration.set(modeConfKey, "nonstrict")
+      hadoopConf.set(modeConfKey, "nonstrict")
       sql("INSERT OVERWRITE TABLE with_parts partition(p) select 3, 4")
       assert(spark.table("with_parts").filter($"p" === 4).collect().head == Row(3, 4))
     } finally {
-      spark.sparkContext.hadoopConfiguration.set(modeConfKey, originalValue)
+      hadoopConf.set(modeConfKey, originalValue)
     }
   }
 }

sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala

Lines changed: 1 addition & 1 deletion
@@ -2053,7 +2053,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
       val deleteOnExitField = classOf[FileSystem].getDeclaredField("deleteOnExit")
       deleteOnExitField.setAccessible(true)

-      val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
+      val fs = FileSystem.get(spark.sessionState.newHadoopConf())
       val setOfPath = deleteOnExitField.get(fs).asInstanceOf[Set[Path]]

       val testData = sparkContext.parallelize(1 to 10).map(i => TestData(i, i.toString)).toDF()
