@@ -40,14 +40,14 @@ import org.apache.spark.util.ThreadUtils
40
40
/**
 * Base type for strategies that turn the status of a file into the input splits
 * that should be read from it.
 */
abstract class ParquetFileSplitter {

  /**
   * Builds the function that maps a file's status to the splits to read from it.
   *
   * @param filters filters handed to the splitter; whether and how they are used
   *                is decided by the concrete implementation
   * @return a function from a file's `FileStatus` to the `FileSplit`s for that file
   */
  def buildSplitter(filters: Seq[Filter]): (FileStatus => Seq[FileSplit])

  /**
   * Produces exactly one split that starts at offset 0 and spans `length` bytes
   * of the file at `path`.
   *
   * Taking the path and length separately (rather than a whole `FileStatus`)
   * lets callers that already hold those two values reuse them directly.
   *
   * @param path   location of the file the split refers to
   * @param length number of bytes the split covers, counted from offset 0
   * @return a one-element sequence holding the split
   */
  def singleFileSplit(path: Path, length: Long): Seq[FileSplit] = {
    // The trailing empty array is the last constructor argument of FileSplit
    // (presumably the preferred host locations -- confirm against the Hadoop API).
    Seq(new FileSplit(path, 0, length, Array.empty))
  }
}
47
47
48
48
/**
 * Splitter that does not consult the supplied filters: every file is returned
 * as a single split built by `singleFileSplit`.
 */
object ParquetDefaultFileSplitter extends ParquetFileSplitter {

  /**
   * @param filters accepted to satisfy the parent contract; not read by this implementation
   * @return a function mapping each file's status to one split covering
   *         `stat.getLen` bytes of `stat.getPath`, starting at offset 0
   */
  override def buildSplitter(filters: Seq[Filter]): (FileStatus => Seq[FileSplit]) = {
    // Unpack the status here so singleFileSplit only needs the path and length.
    stat => singleFileSplit(stat.getPath, stat.getLen)
  }
}
53
53
@@ -84,18 +84,20 @@ class ParquetMetadataFileSplitter(
84
84
(applied, unapplied, filteredBlocks)
85
85
}
86
86
87
+ // Group eligible splits by file Path.
87
88
val eligible = applyParquetFilter(unapplied, filteredBlocks).map { bmd =>
88
89
val blockPath = new Path (root, bmd.getPath)
89
90
new FileSplit (blockPath, bmd.getStartingPos, bmd.getCompressedSize, Array .empty)
90
- }
91
+ }.groupBy(_.getPath)
91
92
92
93
val statFilter : (FileStatus => Seq [FileSplit ]) = { stat =>
93
- if (referencedFiles.contains(stat.getPath)) {
94
- eligible.filter(_.getPath == stat.getPath)
94
+ val filePath = stat.getPath
95
+ if (referencedFiles.contains(filePath)) {
96
+ eligible.getOrElse(filePath, Nil )
95
97
} else {
96
98
log.warn(s " Found _metadata file for $root, " +
97
- s " but no entries for blocks in ${stat.getPath }. Retaining whole file. " )
98
- singleFileSplit(stat)
99
+ s " but no entries for blocks in ${filePath }. Retaining whole file. " )
100
+ singleFileSplit(filePath, stat.getLen )
99
101
}
100
102
}
101
103
statFilter
0 commit comments