Skip to content

Commit 8e80331

Browse files
authored
Spark: Remove dependency on hadoop's filesystem class from remove orphan files (apache#12254)
1 parent a1a9ba1 commit 8e80331

File tree

4 files changed

+240
-51
lines changed

4 files changed

+240
-51
lines changed

docs/docs/spark-procedures.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -317,6 +317,7 @@ Used to remove files which are not referenced in any metadata files of an Iceber
317317
| `equal_schemes` | | map<string, string> | Mapping of file system schemes to be considered equal. Key is a comma-separated list of schemes and value is a scheme (defaults to `map('s3a,s3n','s3')`). |
318318
| `equal_authorities` | | map<string, string> | Mapping of file system authorities to be considered equal. Key is a comma-separated list of authorities and value is an authority. |
319319
| `prefix_mismatch_mode` | | string | Action behavior when location prefixes (schemes/authorities) mismatch: <ul><li>ERROR - throw an exception. (default) </li><li>IGNORE - no action.</li><li>DELETE - delete files.</li></ul> |
320+
| `prefix_listing` | | boolean | When true, use prefix-based file listing via the `SupportsPrefixOperations` interface. The table's FileIO implementation must support `SupportsPrefixOperations` when this flag is enabled (defaults to `false`). |
320321

321322
#### Output
322323

@@ -370,6 +371,11 @@ CALL catalog_name.system.remove_orphan_files(table => 'db.sample', equal_schemes
370371
CALL catalog_name.system.remove_orphan_files(table => 'db.sample', equal_authorities => map('ns1', 'ns2'));
371372
```
372373

374+
List all the files that are candidates for removal, using prefix-based listing to discover them.
375+
```sql
376+
CALL catalog_name.system.remove_orphan_files(table => 'db.sample', prefix_listing => true);
377+
```
378+
373379
### `rewrite_data_files`
374380

375381
Iceberg tracks each data file in a table. More data files leads to more metadata stored in manifest files, and small data files causes an unnecessary amount of metadata and less efficient queries from file open costs.

spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/DeleteOrphanFilesSparkAction.java

Lines changed: 84 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
import org.apache.iceberg.hadoop.HiddenPathFilter;
5151
import org.apache.iceberg.io.BulkDeletionFailureException;
5252
import org.apache.iceberg.io.SupportsBulkOperations;
53+
import org.apache.iceberg.io.SupportsPrefixOperations;
5354
import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
5455
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
5556
import org.apache.iceberg.relocated.com.google.common.base.Strings;
@@ -121,6 +122,7 @@ public class DeleteOrphanFilesSparkAction extends BaseSparkAction<DeleteOrphanFi
121122
private Dataset<Row> compareToFileList;
122123
private Consumer<String> deleteFunc = null;
123124
private ExecutorService deleteExecutorService = null;
125+
private boolean usePrefixListing = false;
124126

125127
DeleteOrphanFilesSparkAction(SparkSession spark, Table table) {
126128
super(spark);
@@ -206,6 +208,11 @@ public DeleteOrphanFilesSparkAction compareToFileList(Dataset<Row> files) {
206208
return this;
207209
}
208210

211+
public DeleteOrphanFilesSparkAction usePrefixListing(boolean newUsePrefixListing) {
212+
this.usePrefixListing = newUsePrefixListing;
213+
return this;
214+
}
215+
209216
private Dataset<String> filteredCompareToFileList() {
210217
Dataset<Row> files = compareToFileList;
211218
if (location != null) {
@@ -303,39 +310,90 @@ private Dataset<String> listedFileDS() {
303310
List<String> subDirs = Lists.newArrayList();
304311
List<String> matchingFiles = Lists.newArrayList();
305312

306-
Predicate<FileStatus> predicate = file -> file.getModificationTime() < olderThanTimestamp;
307313
PathFilter pathFilter = PartitionAwareHiddenPathFilter.forSpecs(table.specs());
308314

309-
// list at most MAX_DRIVER_LISTING_DEPTH levels and only dirs that have
310-
// less than MAX_DRIVER_LISTING_DIRECT_SUB_DIRS direct sub dirs on the driver
311-
listDirRecursively(
312-
location,
313-
predicate,
314-
hadoopConf.value(),
315-
MAX_DRIVER_LISTING_DEPTH,
316-
MAX_DRIVER_LISTING_DIRECT_SUB_DIRS,
317-
subDirs,
318-
pathFilter,
319-
matchingFiles);
320-
321-
JavaRDD<String> matchingFileRDD = sparkContext().parallelize(matchingFiles, 1);
322-
323-
if (subDirs.isEmpty()) {
315+
if (usePrefixListing) {
316+
Preconditions.checkArgument(
317+
table.io() instanceof SupportsPrefixOperations,
318+
"Cannot use prefix listing with FileIO {} which does not support prefix operations.",
319+
table.io());
320+
321+
Predicate<org.apache.iceberg.io.FileInfo> predicate =
322+
fileInfo -> fileInfo.createdAtMillis() < olderThanTimestamp;
323+
listDirRecursivelyWithFileIO(
324+
(SupportsPrefixOperations) table.io(), location, predicate, pathFilter, matchingFiles);
325+
326+
JavaRDD<String> matchingFileRDD = sparkContext().parallelize(matchingFiles, 1);
324327
return spark().createDataset(matchingFileRDD.rdd(), Encoders.STRING());
328+
} else {
329+
Predicate<FileStatus> predicate = file -> file.getModificationTime() < olderThanTimestamp;
330+
// list at most MAX_DRIVER_LISTING_DEPTH levels and only dirs that have
331+
// less than MAX_DRIVER_LISTING_DIRECT_SUB_DIRS direct sub dirs on the driver
332+
listDirRecursivelyWithHadoop(
333+
location,
334+
predicate,
335+
hadoopConf.value(),
336+
MAX_DRIVER_LISTING_DEPTH,
337+
MAX_DRIVER_LISTING_DIRECT_SUB_DIRS,
338+
subDirs,
339+
pathFilter,
340+
matchingFiles);
341+
342+
JavaRDD<String> matchingFileRDD = sparkContext().parallelize(matchingFiles, 1);
343+
344+
if (subDirs.isEmpty()) {
345+
return spark().createDataset(matchingFileRDD.rdd(), Encoders.STRING());
346+
}
347+
348+
int parallelism = Math.min(subDirs.size(), listingParallelism);
349+
JavaRDD<String> subDirRDD = sparkContext().parallelize(subDirs, parallelism);
350+
351+
Broadcast<SerializableConfiguration> conf = sparkContext().broadcast(hadoopConf);
352+
ListDirsRecursively listDirs = new ListDirsRecursively(conf, olderThanTimestamp, pathFilter);
353+
JavaRDD<String> matchingLeafFileRDD = subDirRDD.mapPartitions(listDirs);
354+
355+
JavaRDD<String> completeMatchingFileRDD = matchingFileRDD.union(matchingLeafFileRDD);
356+
return spark().createDataset(completeMatchingFileRDD.rdd(), Encoders.STRING());
357+
}
358+
}
359+
360+
private static void listDirRecursivelyWithFileIO(
361+
SupportsPrefixOperations io,
362+
String dir,
363+
Predicate<org.apache.iceberg.io.FileInfo> predicate,
364+
PathFilter pathFilter,
365+
List<String> matchingFiles) {
366+
String listPath = dir;
367+
if (!dir.endsWith("/")) {
368+
listPath = dir + "/";
369+
}
370+
371+
Iterable<org.apache.iceberg.io.FileInfo> files = io.listPrefix(listPath);
372+
for (org.apache.iceberg.io.FileInfo file : files) {
373+
Path path = new Path(file.location());
374+
if (!isHiddenPath(dir, path, pathFilter) && predicate.test(file)) {
375+
matchingFiles.add(file.location());
376+
}
325377
}
378+
}
326379

327-
int parallelism = Math.min(subDirs.size(), listingParallelism);
328-
JavaRDD<String> subDirRDD = sparkContext().parallelize(subDirs, parallelism);
380+
private static boolean isHiddenPath(String baseDir, Path path, PathFilter pathFilter) {
381+
boolean isHiddenPath = false;
382+
Path currentPath = path;
383+
while (currentPath.getParent().toString().contains(baseDir)) {
384+
if (!pathFilter.accept(currentPath)) {
385+
isHiddenPath = true;
386+
break;
387+
}
329388

330-
Broadcast<SerializableConfiguration> conf = sparkContext().broadcast(hadoopConf);
331-
ListDirsRecursively listDirs = new ListDirsRecursively(conf, olderThanTimestamp, pathFilter);
332-
JavaRDD<String> matchingLeafFileRDD = subDirRDD.mapPartitions(listDirs);
389+
currentPath = currentPath.getParent();
390+
}
333391

334-
JavaRDD<String> completeMatchingFileRDD = matchingFileRDD.union(matchingLeafFileRDD);
335-
return spark().createDataset(completeMatchingFileRDD.rdd(), Encoders.STRING());
392+
return isHiddenPath;
336393
}
337394

338-
private static void listDirRecursively(
395+
@VisibleForTesting
396+
static void listDirRecursivelyWithHadoop(
339397
String dir,
340398
Predicate<FileStatus> predicate,
341399
Configuration conf,
@@ -372,7 +430,7 @@ private static void listDirRecursively(
372430
}
373431

374432
for (String subDir : subDirs) {
375-
listDirRecursively(
433+
listDirRecursivelyWithHadoop(
376434
subDir,
377435
predicate,
378436
conf,
@@ -458,7 +516,7 @@ public Iterator<String> call(Iterator<String> dirs) throws Exception {
458516
Predicate<FileStatus> predicate = file -> file.getModificationTime() < olderThanTimestamp;
459517

460518
while (dirs.hasNext()) {
461-
listDirRecursively(
519+
listDirRecursivelyWithHadoop(
462520
dirs.next(),
463521
predicate,
464522
hadoopConf.value().value(),

spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/procedures/RemoveOrphanFilesProcedure.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,8 @@ public class RemoveOrphanFilesProcedure extends BaseProcedure {
6363
ProcedureParameter.optional("equal_schemes", STRING_MAP),
6464
ProcedureParameter.optional("equal_authorities", STRING_MAP),
6565
ProcedureParameter.optional("prefix_mismatch_mode", DataTypes.StringType),
66+
// List files with prefix operations. Default is false.
67+
ProcedureParameter.optional("prefix_listing", DataTypes.BooleanType)
6668
};
6769

6870
private static final StructType OUTPUT_TYPE =
@@ -136,6 +138,8 @@ public InternalRow[] call(InternalRow args) {
136138
PrefixMismatchMode prefixMismatchMode =
137139
args.isNullAt(8) ? null : PrefixMismatchMode.fromString(args.getString(8));
138140

141+
boolean prefixListing = args.isNullAt(9) ? false : args.getBoolean(9);
142+
139143
return withIcebergTable(
140144
tableIdent,
141145
table -> {
@@ -182,6 +186,8 @@ public InternalRow[] call(InternalRow args) {
182186
action.prefixMismatchMode(prefixMismatchMode);
183187
}
184188

189+
action.usePrefixListing(prefixListing);
190+
185191
DeleteOrphanFiles.Result result = action.execute();
186192

187193
return toOutputRows(result);

0 commit comments

Comments
 (0)