|
50 | 50 | import org.apache.iceberg.hadoop.HiddenPathFilter; |
51 | 51 | import org.apache.iceberg.io.BulkDeletionFailureException; |
52 | 52 | import org.apache.iceberg.io.SupportsBulkOperations; |
| 53 | +import org.apache.iceberg.io.SupportsPrefixOperations; |
53 | 54 | import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; |
54 | 55 | import org.apache.iceberg.relocated.com.google.common.base.Preconditions; |
55 | 56 | import org.apache.iceberg.relocated.com.google.common.base.Strings; |
@@ -121,6 +122,7 @@ public class DeleteOrphanFilesSparkAction extends BaseSparkAction<DeleteOrphanFi |
121 | 122 | private Dataset<Row> compareToFileList; |
122 | 123 | private Consumer<String> deleteFunc = null; |
123 | 124 | private ExecutorService deleteExecutorService = null; |
| 125 | + private boolean usePrefixListing = false; |
124 | 126 |
|
125 | 127 | DeleteOrphanFilesSparkAction(SparkSession spark, Table table) { |
126 | 128 | super(spark); |
@@ -206,6 +208,11 @@ public DeleteOrphanFilesSparkAction compareToFileList(Dataset<Row> files) { |
206 | 208 | return this; |
207 | 209 | } |
208 | 210 |
|
| 211 | + public DeleteOrphanFilesSparkAction usePrefixListing(boolean newUsePrefixListing) { |
| 212 | + this.usePrefixListing = newUsePrefixListing; |
| 213 | + return this; |
| 214 | + } |
| 215 | + |
209 | 216 | private Dataset<String> filteredCompareToFileList() { |
210 | 217 | Dataset<Row> files = compareToFileList; |
211 | 218 | if (location != null) { |
@@ -303,39 +310,90 @@ private Dataset<String> listedFileDS() { |
303 | 310 | List<String> subDirs = Lists.newArrayList(); |
304 | 311 | List<String> matchingFiles = Lists.newArrayList(); |
305 | 312 |
|
306 | | - Predicate<FileStatus> predicate = file -> file.getModificationTime() < olderThanTimestamp; |
307 | 313 | PathFilter pathFilter = PartitionAwareHiddenPathFilter.forSpecs(table.specs()); |
308 | 314 |
|
309 | | - // list at most MAX_DRIVER_LISTING_DEPTH levels and only dirs that have |
310 | | - // less than MAX_DRIVER_LISTING_DIRECT_SUB_DIRS direct sub dirs on the driver |
311 | | - listDirRecursively( |
312 | | - location, |
313 | | - predicate, |
314 | | - hadoopConf.value(), |
315 | | - MAX_DRIVER_LISTING_DEPTH, |
316 | | - MAX_DRIVER_LISTING_DIRECT_SUB_DIRS, |
317 | | - subDirs, |
318 | | - pathFilter, |
319 | | - matchingFiles); |
320 | | - |
321 | | - JavaRDD<String> matchingFileRDD = sparkContext().parallelize(matchingFiles, 1); |
322 | | - |
323 | | - if (subDirs.isEmpty()) { |
| 315 | + if (usePrefixListing) { |
| 316 | + Preconditions.checkArgument( |
| 317 | + table.io() instanceof SupportsPrefixOperations, |
| 318 | +        "Cannot use prefix listing with FileIO %s which does not support prefix operations.",
| 319 | + table.io()); |
| 320 | + |
| 321 | + Predicate<org.apache.iceberg.io.FileInfo> predicate = |
| 322 | + fileInfo -> fileInfo.createdAtMillis() < olderThanTimestamp; |
| 323 | + listDirRecursivelyWithFileIO( |
| 324 | + (SupportsPrefixOperations) table.io(), location, predicate, pathFilter, matchingFiles); |
| 325 | + |
| 326 | + JavaRDD<String> matchingFileRDD = sparkContext().parallelize(matchingFiles, 1); |
324 | 327 | return spark().createDataset(matchingFileRDD.rdd(), Encoders.STRING()); |
| 328 | + } else { |
| 329 | + Predicate<FileStatus> predicate = file -> file.getModificationTime() < olderThanTimestamp; |
| 330 | + // list at most MAX_DRIVER_LISTING_DEPTH levels and only dirs that have |
| 331 | + // less than MAX_DRIVER_LISTING_DIRECT_SUB_DIRS direct sub dirs on the driver |
| 332 | + listDirRecursivelyWithHadoop( |
| 333 | + location, |
| 334 | + predicate, |
| 335 | + hadoopConf.value(), |
| 336 | + MAX_DRIVER_LISTING_DEPTH, |
| 337 | + MAX_DRIVER_LISTING_DIRECT_SUB_DIRS, |
| 338 | + subDirs, |
| 339 | + pathFilter, |
| 340 | + matchingFiles); |
| 341 | + |
| 342 | + JavaRDD<String> matchingFileRDD = sparkContext().parallelize(matchingFiles, 1); |
| 343 | + |
| 344 | + if (subDirs.isEmpty()) { |
| 345 | + return spark().createDataset(matchingFileRDD.rdd(), Encoders.STRING()); |
| 346 | + } |
| 347 | + |
| 348 | + int parallelism = Math.min(subDirs.size(), listingParallelism); |
| 349 | + JavaRDD<String> subDirRDD = sparkContext().parallelize(subDirs, parallelism); |
| 350 | + |
| 351 | + Broadcast<SerializableConfiguration> conf = sparkContext().broadcast(hadoopConf); |
| 352 | + ListDirsRecursively listDirs = new ListDirsRecursively(conf, olderThanTimestamp, pathFilter); |
| 353 | + JavaRDD<String> matchingLeafFileRDD = subDirRDD.mapPartitions(listDirs); |
| 354 | + |
| 355 | + JavaRDD<String> completeMatchingFileRDD = matchingFileRDD.union(matchingLeafFileRDD); |
| 356 | + return spark().createDataset(completeMatchingFileRDD.rdd(), Encoders.STRING()); |
| 357 | + } |
| 358 | + } |
| 359 | + |
| 360 | + private static void listDirRecursivelyWithFileIO( |
| 361 | + SupportsPrefixOperations io, |
| 362 | + String dir, |
| 363 | + Predicate<org.apache.iceberg.io.FileInfo> predicate, |
| 364 | + PathFilter pathFilter, |
| 365 | + List<String> matchingFiles) { |
| 366 | + String listPath = dir; |
| 367 | + if (!dir.endsWith("/")) { |
| 368 | + listPath = dir + "/"; |
| 369 | + } |
| 370 | + |
| 371 | + Iterable<org.apache.iceberg.io.FileInfo> files = io.listPrefix(listPath); |
| 372 | + for (org.apache.iceberg.io.FileInfo file : files) { |
| 373 | + Path path = new Path(file.location()); |
| 374 | + if (!isHiddenPath(dir, path, pathFilter) && predicate.test(file)) { |
| 375 | + matchingFiles.add(file.location()); |
| 376 | + } |
325 | 377 | } |
| 378 | + } |
326 | 379 |
|
327 | | - int parallelism = Math.min(subDirs.size(), listingParallelism); |
328 | | - JavaRDD<String> subDirRDD = sparkContext().parallelize(subDirs, parallelism); |
| 380 | + private static boolean isHiddenPath(String baseDir, Path path, PathFilter pathFilter) { |
| 381 | + boolean isHiddenPath = false; |
| 382 | + Path currentPath = path; |
| 383 | + while (currentPath.getParent().toString().contains(baseDir)) { |
| 384 | + if (!pathFilter.accept(currentPath)) { |
| 385 | + isHiddenPath = true; |
| 386 | + break; |
| 387 | + } |
329 | 388 |
|
330 | | - Broadcast<SerializableConfiguration> conf = sparkContext().broadcast(hadoopConf); |
331 | | - ListDirsRecursively listDirs = new ListDirsRecursively(conf, olderThanTimestamp, pathFilter); |
332 | | - JavaRDD<String> matchingLeafFileRDD = subDirRDD.mapPartitions(listDirs); |
| 389 | + currentPath = currentPath.getParent(); |
| 390 | + } |
333 | 391 |
|
334 | | - JavaRDD<String> completeMatchingFileRDD = matchingFileRDD.union(matchingLeafFileRDD); |
335 | | - return spark().createDataset(completeMatchingFileRDD.rdd(), Encoders.STRING()); |
| 392 | + return isHiddenPath; |
336 | 393 | } |
337 | 394 |
|
338 | | - private static void listDirRecursively( |
| 395 | + @VisibleForTesting |
| 396 | + static void listDirRecursivelyWithHadoop( |
339 | 397 | String dir, |
340 | 398 | Predicate<FileStatus> predicate, |
341 | 399 | Configuration conf, |
@@ -372,7 +430,7 @@ private static void listDirRecursively( |
372 | 430 | } |
373 | 431 |
|
374 | 432 | for (String subDir : subDirs) { |
375 | | - listDirRecursively( |
| 433 | + listDirRecursivelyWithHadoop( |
376 | 434 | subDir, |
377 | 435 | predicate, |
378 | 436 | conf, |
@@ -458,7 +516,7 @@ public Iterator<String> call(Iterator<String> dirs) throws Exception { |
458 | 516 | Predicate<FileStatus> predicate = file -> file.getModificationTime() < olderThanTimestamp; |
459 | 517 |
|
460 | 518 | while (dirs.hasNext()) { |
461 | | - listDirRecursively( |
| 519 | + listDirRecursivelyWithHadoop( |
462 | 520 | dirs.next(), |
463 | 521 | predicate, |
464 | 522 | hadoopConf.value().value(), |
|
0 commit comments