This repository was archived by the owner on Mar 24, 2025. It is now read-only.

Commit 1e25d7b

Clone hadoopConf and use (#582)
Co-authored-by: sandeep katta <[email protected]>
1 parent: 729ac27

File tree: 2 files changed, +43 −4 lines

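
For context: before this change, XmlFile.withCharset wrote the row-tag and encoding keys directly into the SparkContext's shared hadoopConfiguration, so two reads running in parallel with different rowTag values could overwrite each other's settings. Below is a minimal standalone sketch of that hazard, not taken from the patch; the key name "xmlinput.start" is illustrative rather than guaranteed to be the library's actual constant.

import org.apache.hadoop.conf.Configuration

// Two readers mutating one shared Configuration: the last writer wins,
// so one reader ends up scanning for the wrong start tag.
// (Uses Scala 2.12+ SAM syntax for Runnable.)
object SharedConfRace extends App {
  val shared = new Configuration()

  val t1 = new Thread(() => shared.set("xmlinput.start", "<person>"))
  val t2 = new Thread(() => shared.set("xmlinput.start", "<book>"))
  t1.start(); t2.start()
  t1.join(); t2.join()

  // Prints "<person>" or "<book>", depending on thread scheduling.
  println(shared.get("xmlinput.start"))
}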

src/main/scala/com/databricks/spark/xml/util/XmlFile.scala (7 additions, 4 deletions)

@@ -23,6 +23,7 @@ import scala.collection.Map
 
 import com.databricks.spark.xml.parsers.StaxXmlGenerator
 import com.sun.xml.txw2.output.IndentingXMLStreamWriter
+import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.io.{Text, LongWritable}
 
 import org.apache.spark.rdd.RDD
@@ -40,13 +41,15 @@ private[xml] object XmlFile {
       rowTag: String): RDD[String] = {
     // This just checks the charset's validity early, to keep behavior
     Charset.forName(charset)
-    context.hadoopConfiguration.set(XmlInputFormat.START_TAG_KEY, s"<$rowTag>")
-    context.hadoopConfiguration.set(XmlInputFormat.END_TAG_KEY, s"</$rowTag>")
-    context.hadoopConfiguration.set(XmlInputFormat.ENCODING_KEY, charset)
+    val config = new Configuration(context.hadoopConfiguration)
+    config.set(XmlInputFormat.START_TAG_KEY, s"<$rowTag>")
+    config.set(XmlInputFormat.END_TAG_KEY, s"</$rowTag>")
+    config.set(XmlInputFormat.ENCODING_KEY, charset)
     context.newAPIHadoopFile(location,
       classOf[XmlInputFormat],
       classOf[LongWritable],
-      classOf[Text]).map { case (_, text) => text.toString }
+      classOf[Text],
+      config).map { case (_, text) => text.toString }
   }
 
   /**
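
The fix leans on two existing APIs: Hadoop's Configuration(Configuration other) copy constructor, which snapshots the shared settings into an independent object, and the SparkContext.newAPIHadoopFile overload that accepts an explicit Configuration instead of defaulting to the shared one. A small sketch of the copy-constructor semantics (key name illustrative):

import org.apache.hadoop.conf.Configuration

object ClonedConfDemo extends App {
  val shared = new Configuration()
  shared.set("xmlinput.start", "<person>")

  // Copy constructor: an independent, per-read snapshot.
  val perRead = new Configuration(shared)
  perRead.set("xmlinput.start", "<book>")

  assert(shared.get("xmlinput.start") == "<person>")  // original untouched
  assert(perRead.get("xmlinput.start") == "<book>")
}

Each read therefore mutates only its own snapshot, which Spark hands to the input format for the lifetime of that RDD.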

src/test/scala/com/databricks/spark/xml/XmlSuite.scala (36 additions, 0 deletions)

@@ -22,6 +22,7 @@ import java.util.TimeZone
 
 import scala.io.Source
 import scala.collection.JavaConverters._
+import scala.collection.mutable
 
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.io.{LongWritable, Text}
@@ -1395,6 +1396,41 @@ final class XmlSuite extends AnyFunSuite with BeforeAndAfterAll {
     assert(df.collect()(1).getStruct(0).get(2) === null)
   }
 
+  test("read multiple xml files in parallel") {
+    val failedAgesSet = mutable.Set[Long]()
+    val threads_ages = (1 to 10).map { i =>
+      new Thread {
+        override def run() {
+          val df = spark.read.option("rowTag", "person").format("xml")
+            .load(resDir + "ages.xml")
+          if (df.schema.fields.isEmpty) {
+            failedAgesSet.add(i)
+          }
+        }
+      }
+    }
+
+    val failedBooksSet = mutable.Set[Long]()
+    val threads_books = (11 to 20).map { i =>
+      new Thread {
+        override def run() {
+          val df = spark.read.option("rowTag", "book").format("xml")
+            .load(resDir + "books.xml")
+          if (df.schema.fields.isEmpty) {
+            failedBooksSet.add(i)
+          }
+        }
+      }
+    }
+
+    threads_ages.foreach(_.start())
+    threads_books.foreach(_.start())
+    threads_ages.foreach(_.join())
+    threads_books.foreach(_.join())
+    assert(failedBooksSet.isEmpty)
+    assert(failedAgesSet.isEmpty)
+  }
+
   private def getLines(path: Path): Seq[String] = {
     val source = Source.fromFile(path.toFile)
     try {