Skip to content

Commit d24f722

Browse files
apurva-metafacebook-github-bot
authored andcommitted
feat: [presto][iceberg] Add PUFFIN file format support for deletion vector discovery (#27393)
Summary: Iceberg V3 introduces deletion vectors stored as blobs inside Puffin files. Previously, the coordinator's IcebergSplitSource rejected PUFFIN-format delete files with a NOT_SUPPORTED error, preventing V3 deletion vectors from being discovered and sent to workers. This diff: 1. Adds PUFFIN to the FileFormat enum (both presto-trunk and presto-facebook-trunk) so fromIcebergFileFormat() can convert Iceberg's PUFFIN format to Presto's FileFormat.PUFFIN. 2. Removes the PUFFIN rejection check in presto-trunk's IcebergSplitSource.toIcebergSplit(), allowing deletion vector files to flow through to workers. 3. Updates TestIcebergV3 to verify PUFFIN files are accepted rather than rejected at split enumeration time. The C++ worker-side changes (protocol enum + connector conversion) will follow in a separate diff. Differential Revision: D97531557
1 parent d5dc345 commit d24f722

File tree

3 files changed

+22
-13
lines changed

3 files changed

+22
-13
lines changed

presto-iceberg/src/main/java/com/facebook/presto/iceberg/FileFormat.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@ public enum FileFormat
2626
ORC("orc", true),
2727
PARQUET("parquet", true),
2828
AVRO("avro", true),
29-
METADATA("metadata.json", false);
29+
METADATA("metadata.json", false),
30+
PUFFIN("puffin", false);
3031

3132
private final String ext;
3233
private final boolean splittable;
@@ -61,6 +62,9 @@ public static FileFormat fromIcebergFileFormat(org.apache.iceberg.FileFormat for
6162
case METADATA:
6263
prestoFileFormat = METADATA;
6364
break;
65+
case PUFFIN:
66+
prestoFileFormat = PUFFIN;
67+
break;
6468
default:
6569
throw new PrestoException(NOT_SUPPORTED, "Unsupported file format: " + format);
6670
}

presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergSplitSource.java

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
import com.facebook.presto.spi.ConnectorSession;
1919
import com.facebook.presto.spi.ConnectorSplit;
2020
import com.facebook.presto.spi.ConnectorSplitSource;
21-
import com.facebook.presto.spi.PrestoException;
2221
import com.facebook.presto.spi.SplitWeight;
2322
import com.facebook.presto.spi.connector.ConnectorPartitionHandle;
2423
import com.facebook.presto.spi.schedule.NodeSelectionStrategy;
@@ -47,7 +46,6 @@
4746
import static com.facebook.presto.iceberg.IcebergUtil.getTargetSplitSize;
4847
import static com.facebook.presto.iceberg.IcebergUtil.metadataColumnsMatchPredicates;
4948
import static com.facebook.presto.iceberg.IcebergUtil.partitionDataFromStructLike;
50-
import static com.facebook.presto.spi.StandardErrorCode.NOT_SUPPORTED;
5149
import static com.google.common.collect.ImmutableList.toImmutableList;
5250
import static com.google.common.collect.Iterators.limit;
5351
import static java.util.Objects.requireNonNull;
@@ -126,13 +124,6 @@ private ConnectorSplit toIcebergSplit(FileScanTask task)
126124
PartitionSpec spec = task.spec();
127125
Optional<PartitionData> partitionData = partitionDataFromStructLike(spec, task.file().partition());
128126

129-
// Validate no PUFFIN deletion vectors (Iceberg v3 feature not yet supported)
130-
for (org.apache.iceberg.DeleteFile deleteFile : task.deletes()) {
131-
if (deleteFile.format() == org.apache.iceberg.FileFormat.PUFFIN) {
132-
throw new PrestoException(NOT_SUPPORTED, "Iceberg deletion vectors (PUFFIN format) are not supported");
133-
}
134-
}
135-
136127
// TODO: We should leverage residual expression and convert that to TupleDomain.
137128
// The predicate here is used by readers for predicate push down at reader level,
138129
// so when we do not use residual expression, we are just wasting CPU cycles

presto-iceberg/src/test/java/com/facebook/presto/iceberg/TestIcebergV3.java

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
import static java.lang.String.format;
4545
import static org.assertj.core.api.Assertions.assertThatThrownBy;
4646
import static org.testng.Assert.assertEquals;
47+
import static org.testng.Assert.assertFalse;
4748

4849
public class TestIcebergV3
4950
extends AbstractTestQueryFramework
@@ -279,10 +280,10 @@ public void testOptimizeOnV3Table()
279280
}
280281

281282
@Test
282-
public void testPuffinDeletionVectorsNotSupported()
283+
public void testPuffinDeletionVectorsAccepted()
283284
throws Exception
284285
{
285-
String tableName = "test_puffin_deletion_vectors_not_supported";
286+
String tableName = "test_puffin_deletion_vectors_accepted";
286287
try {
287288
assertUpdate("CREATE TABLE " + tableName + " (id integer, value varchar) WITH (\"format-version\" = '3')");
288289
assertUpdate("INSERT INTO " + tableName + " VALUES (1, 'one'), (2, 'two')", 2);
@@ -309,7 +310,20 @@ public void testPuffinDeletionVectorsNotSupported()
309310
.commit();
310311
}
311312

312-
assertQueryFails("SELECT * FROM " + tableName, "Iceberg deletion vectors.*PUFFIN.*not supported");
313+
// The PUFFIN delete file is now accepted by the split source (no longer
314+
// throws NOT_SUPPORTED). The query will fail downstream because the fake
315+
// .puffin file doesn't exist on disk, but the important thing is that the
316+
// coordinator no longer rejects it at split enumeration time.
317+
try {
318+
computeActual("SELECT * FROM " + tableName);
319+
}
320+
catch (RuntimeException e) {
321+
// Verify the error is NOT the old "PUFFIN not supported" rejection.
322+
// Other failures (e.g., fake .puffin file not on disk) are acceptable.
323+
assertFalse(
324+
e.getMessage().contains("Iceberg deletion vectors") && e.getMessage().contains("not supported"),
325+
"PUFFIN deletion vectors should be accepted, not rejected: " + e.getMessage());
326+
}
313327
}
314328
finally {
315329
dropTable(tableName);

0 commit comments

Comments
 (0)