Fix parquet split calculation to avoid O(file*block) lookups (apache-spark-on-k8s#380)

pwoody · Robert Kruszewski · commit 93e4fbb293c1 · 2018-06-24T22:32:13.000+01:00
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileSplitter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileSplitter.scala
@@ -40,14 +40,14 @@ import org.apache.spark.util.ThreadUtils
 abstract class ParquetFileSplitter {
   def buildSplitter(filters: Seq[Filter]): (FileStatus => Seq[FileSplit])
 
-  def singleFileSplit(stat: FileStatus): Seq[FileSplit] = {
-    Seq(new FileSplit(stat.getPath, 0, stat.getLen, Array.empty))
+  def singleFileSplit(path: Path, length: Long): Seq[FileSplit] = {
+    Seq(new FileSplit(path, 0, length, Array.empty))
   }
 }
 
 object ParquetDefaultFileSplitter extends ParquetFileSplitter {
   override def buildSplitter(filters: Seq[Filter]): (FileStatus => Seq[FileSplit]) = {
-    stat => singleFileSplit(stat)
+    stat => singleFileSplit(stat.getPath, stat.getLen)
   }
 }
 
@@ -84,18 +84,20 @@ class ParquetMetadataFileSplitter(
       (applied, unapplied, filteredBlocks)
     }
 
+    // Group eligible splits by file Path.
     val eligible = applyParquetFilter(unapplied, filteredBlocks).map { bmd =>
       val blockPath = new Path(root, bmd.getPath)
       new FileSplit(blockPath, bmd.getStartingPos, bmd.getCompressedSize, Array.empty)
-    }
+    }.groupBy(_.getPath)
 
     val statFilter: (FileStatus => Seq[FileSplit]) = { stat =>
-      if (referencedFiles.contains(stat.getPath)) {
-        eligible.filter(_.getPath == stat.getPath)
+      val filePath = stat.getPath
+      if (referencedFiles.contains(filePath)) {
+        eligible.getOrElse(filePath, Nil)
       } else {
         log.warn(s"Found _metadata file for $root," +
-          s" but no entries for blocks in ${stat.getPath}. Retaining whole file.")
-        singleFileSplit(stat)
+          s" but no entries for blocks in ${filePath}. Retaining whole file.")
+        singleFileSplit(filePath, stat.getLen)
       }
     }
     statFilter

Original file line number	Diff line number	Diff line change
`@@ -40,14 +40,14 @@ import org.apache.spark.util.ThreadUtils`
`40`	`40`	`abstract class ParquetFileSplitter {`
`41`	`41`	`def buildSplitter(filters: Seq[Filter]): (FileStatus => Seq[FileSplit])`
`42`	`42`
`43`		`- def singleFileSplit(stat: FileStatus): Seq[FileSplit] = {`
`44`		`- Seq(new FileSplit(stat.getPath, 0, stat.getLen, Array.empty))`
	`43`	`+ def singleFileSplit(path: Path, length: Long): Seq[FileSplit] = {`
	`44`	`+ Seq(new FileSplit(path, 0, length, Array.empty))`
`45`	`45`	`}`
`46`	`46`	`}`
`47`	`47`
`48`	`48`	`object ParquetDefaultFileSplitter extends ParquetFileSplitter {`
`49`	`49`	`override def buildSplitter(filters: Seq[Filter]): (FileStatus => Seq[FileSplit]) = {`
`50`		`- stat => singleFileSplit(stat)`
	`50`	`+ stat => singleFileSplit(stat.getPath, stat.getLen)`
`51`	`51`	`}`
`52`	`52`	`}`
`53`	`53`
`@@ -84,18 +84,20 @@ class ParquetMetadataFileSplitter(`
`84`	`84`	`(applied, unapplied, filteredBlocks)`
`85`	`85`	`}`
`86`	`86`
	`87`	`+ // Group eligible splits by file Path.`
`87`	`88`	`val eligible = applyParquetFilter(unapplied, filteredBlocks).map { bmd =>`
`88`	`89`	`val blockPath = new Path(root, bmd.getPath)`
`89`	`90`	`new FileSplit(blockPath, bmd.getStartingPos, bmd.getCompressedSize, Array.empty)`
`90`		`- }`
	`91`	`+ }.groupBy(_.getPath)`
`91`	`92`
`92`	`93`	`val statFilter: (FileStatus => Seq[FileSplit]) = { stat =>`
`93`		`- if (referencedFiles.contains(stat.getPath)) {`
`94`		`- eligible.filter(_.getPath == stat.getPath)`
	`94`	`+ val filePath = stat.getPath`
	`95`	`+ if (referencedFiles.contains(filePath)) {`
	`96`	`+ eligible.getOrElse(filePath, Nil)`
`95`	`97`	`} else {`
`96`	`98`	`log.warn(s"Found _metadata file for $root," +`
`97`		`- s" but no entries for blocks in ${stat.getPath}. Retaining whole file.")`
`98`		`- singleFileSplit(stat)`
	`99`	`+ s" but no entries for blocks in ${filePath}. Retaining whole file.")`
	`100`	`+ singleFileSplit(filePath, stat.getLen)`
`99`	`101`	`}`
`100`	`102`	`}`
`101`	`103`	`statFilter`