Commit f89dc40

Support Iceberg procedure rewrite_data_files

1 parent 3d1560a

File tree

3 files changed: +721 -0 lines changed

presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergCommonModule.java

Lines changed: 1 addition & 0 deletions

@@ -151,6 +151,7 @@ public void setup(Binder binder)
         procedures.addBinding().toProvider(RegisterTableProcedure.class).in(Scopes.SINGLETON);
         procedures.addBinding().toProvider(UnregisterTableProcedure.class).in(Scopes.SINGLETON);
         procedures.addBinding().toProvider(ExpireSnapshotsProcedure.class).in(Scopes.SINGLETON);
+        procedures.addBinding().toProvider(RewriteDataFilesProcedure.class).in(Scopes.SINGLETON);

         // for orc
         binder.bind(EncryptionLibrary.class).annotatedWith(HiveDwrfEncryptionProvider.ForCryptoService.class).to(UnsupportedEncryptionLibrary.class).in(Scopes.SINGLETON);
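
The new line follows the module's existing pattern for contributing connector procedures. As a rough sketch of the Guice set-binder idiom in play here (assuming `procedures` is a Multibinder<Procedure>, declared earlier in setup outside this diff's context):

import com.facebook.presto.spi.procedure.Procedure;
import com.google.inject.Binder;
import com.google.inject.Module;
import com.google.inject.Scopes;
import com.google.inject.multibindings.Multibinder;

// Minimal illustration only: each addBinding() contributes one Procedure
// to an injectable Set<Procedure> that the connector later exposes.
public class ProcedureBindingSketch
        implements Module
{
    @Override
    public void configure(Binder binder)
    {
        Multibinder<Procedure> procedures = Multibinder.newSetBinder(binder, Procedure.class);
        procedures.addBinding().toProvider(RewriteDataFilesProcedure.class).in(Scopes.SINGLETON);
    }
}
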
presto-iceberg/src/main/java/com/facebook/presto/iceberg/RewriteDataFilesProcedure.java (new file)

Lines changed: 211 additions & 0 deletions

@@ -0,0 +1,211 @@
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.facebook.presto.iceberg;

import com.facebook.airlift.json.JsonCodec;
import com.facebook.presto.common.predicate.TupleDomain;
import com.facebook.presto.common.type.TypeManager;
import com.facebook.presto.spi.ConnectorDistributedProcedureHandle;
import com.facebook.presto.spi.ConnectorSession;
import com.facebook.presto.spi.ConnectorSplitSource;
import com.facebook.presto.spi.FixedSplitSource;
import com.facebook.presto.spi.classloader.ThreadContextClassLoader;
import com.facebook.presto.spi.procedure.DistributedProcedure;
import com.facebook.presto.spi.procedure.Procedure;
import com.google.common.base.VerifyException;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import io.airlift.slice.Slice;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.DataFiles;
import org.apache.iceberg.DeleteFile;
import org.apache.iceberg.FileContent;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.PartitionSpecParser;
import org.apache.iceberg.RewriteFiles;
import org.apache.iceberg.SchemaParser;
import org.apache.iceberg.Snapshot;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableScan;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.util.TableScanUtil;

import javax.inject.Inject;
import javax.inject.Provider;

import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.function.Consumer;

import static com.facebook.presto.common.type.StandardTypes.VARCHAR;
import static com.facebook.presto.hive.rule.FilterPushdownUtils.isEntireColumn;
import static com.facebook.presto.iceberg.ExpressionConverter.toIcebergExpression;
import static com.facebook.presto.iceberg.IcebergSessionProperties.getMinimumAssignedSplitWeight;
import static com.facebook.presto.iceberg.IcebergSessionProperties.isPushdownFilterEnabled;
import static com.facebook.presto.iceberg.IcebergUtil.getColumns;
import static com.facebook.presto.iceberg.IcebergUtil.getFileFormat;
import static com.facebook.presto.spi.procedure.DistributedProcedure.SCHEMA;
import static com.facebook.presto.spi.procedure.DistributedProcedure.TABLE_NAME;
import static com.google.common.collect.ImmutableList.toImmutableList;
import static java.util.Objects.requireNonNull;

public class RewriteDataFilesProcedure
        implements Provider<Procedure>
{
    private final TypeManager typeManager;
    private final JsonCodec<CommitTaskData> commitTaskCodec;

    @Inject
    public RewriteDataFilesProcedure(
            TypeManager typeManager,
            JsonCodec<CommitTaskData> commitTaskCodec)
    {
        this.typeManager = requireNonNull(typeManager, "typeManager is null");
        this.commitTaskCodec = requireNonNull(commitTaskCodec, "commitTaskCodec is null");
    }

    @Override
    public Procedure get()
    {
        return new DistributedProcedure(
                "system",
                "rewrite_data_files",
                ImmutableList.of(
                        new Procedure.Argument(SCHEMA, VARCHAR),
                        new Procedure.Argument(TABLE_NAME, VARCHAR),
                        new Procedure.Argument("filter", VARCHAR, false, "TRUE"),
                        new Procedure.Argument("options", "map(varchar, varchar)", false, null)),
                (session, transactionContext, tableLayoutHandle, arguments) -> beginCallDistributedProcedure(session, (IcebergTransactionContext) transactionContext, (IcebergTableLayoutHandle) tableLayoutHandle, arguments),
                (transactionContext, tableHandle, fragments) -> finishCallDistributedProcedure((IcebergTransactionContext) transactionContext, tableHandle, fragments));
    }

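    // A DistributedProcedure pairs a begin callback, run on the coordinator to
    // plan splits before the workers rewrite files, with a finish callback that
    // commits the workers' results. Assuming the Iceberg catalog is named
    // "iceberg", a hypothetical invocation would look like:
    //   CALL iceberg.system.rewrite_data_files(schema => 'tpch', table_name => 'orders', filter => 'ds = ''2024-01-01''');
    // "filter" defaults to 'TRUE' (rewrite the whole table); "options" may be omitted.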
    private ConnectorDistributedProcedureHandle beginCallDistributedProcedure(ConnectorSession session, IcebergTransactionContext transactionContext, IcebergTableLayoutHandle layoutHandle, Object[] arguments)
    {
        try (ThreadContextClassLoader ignored = new ThreadContextClassLoader(getClass().getClassLoader())) {
            Table icebergTable = transactionContext.getTable().orElseThrow(() -> new VerifyException("No Iceberg table in transaction context"));
            IcebergTableHandle tableHandle = layoutHandle.getTable();

            ConnectorSplitSource splitSource;
            if (!tableHandle.getIcebergTableName().getSnapshotId().isPresent()) {
                // A table without a snapshot has no data files to rewrite
                splitSource = new FixedSplitSource(ImmutableList.of());
            }
            else {
                // With filter pushdown enabled, combine the layout's partition-column
                // predicate with the whole-column part of its domain predicate;
                // otherwise fall back to the table handle's predicate
                TupleDomain<IcebergColumnHandle> predicate = isPushdownFilterEnabled(session) ?
                        layoutHandle.getPartitionColumnPredicate()
                                .transform(IcebergColumnHandle.class::cast)
                                .intersect(layoutHandle.getDomainPredicate()
                                        .transform(subfield -> isEntireColumn(subfield) ? subfield.getRootName() : null)
                                        .transform(layoutHandle.getPredicateColumns()::get)) :
                        tableHandle.getPredicate();
                TableScan tableScan = icebergTable.newScan()
                        .filter(toIcebergExpression(predicate))
                        .useSnapshot(tableHandle.getIcebergTableName().getSnapshotId().get());

                // Record every scanned data file, and every delete file that this
                // rewrite fully applies, so the finish step can remove them
                Consumer<FileScanTask> fileScanTaskConsumer = (task) -> {
                    transactionContext.getScannedDataFiles().add(task.file());
                    if (!task.deletes().isEmpty()) {
                        task.deletes().forEach(deleteFile -> {
                            if (deleteFile.content() == FileContent.EQUALITY_DELETES &&
                                    !icebergTable.specs().get(deleteFile.specId()).isPartitioned() &&
                                    !predicate.isAll()) {
                                // Equality files with an unpartitioned spec are applied as global deletes,
                                // so they should not be cleaned up unless the whole table is optimized
                                return;
                            }
                            transactionContext.getFullyAppliedDeleteFiles().add(deleteFile);
                        });
                    }
                };

                splitSource = new CallDistributedProcedureSplitSource(
                        session,
                        tableScan,
                        TableScanUtil.splitFiles(tableScan.planFiles(), tableScan.targetSplitSize()),
                        Optional.of(fileScanTaskConsumer),
                        getMinimumAssignedSplitWeight(session));
            }
            transactionContext.setConnectorSplitSource(splitSource);

            return new IcebergDistributedProcedureHandle(
                    tableHandle.getSchemaName(),
                    tableHandle.getIcebergTableName(),
                    SchemaParser.toJson(icebergTable.schema()),
                    PartitionSpecParser.toJson(icebergTable.spec()),
                    getColumns(icebergTable.schema(), icebergTable.spec(), typeManager),
                    icebergTable.location(),
                    getFileFormat(icebergTable),
                    icebergTable.properties());
        }
    }

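    // The finish step runs after the workers have written the compacted files.
    // Each fragment is a JSON-serialized CommitTaskData describing one newly
    // written file; they are rebuilt into DataFiles and committed as a single
    // rewrite snapshot that atomically swaps the scanned files for the new ones.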
    private void finishCallDistributedProcedure(IcebergTransactionContext transactionContext, ConnectorDistributedProcedureHandle procedureHandle, Collection<Slice> fragments)
    {
        if (fragments.isEmpty() &&
                transactionContext.getScannedDataFiles().isEmpty() &&
                transactionContext.getFullyAppliedDeleteFiles().isEmpty()) {
            // Nothing was scanned and nothing was rewritten
            return;
        }

        try (ThreadContextClassLoader ignored = new ThreadContextClassLoader(getClass().getClassLoader())) {
            IcebergDistributedProcedureHandle handle = (IcebergDistributedProcedureHandle) procedureHandle;
            Table icebergTable = transactionContext.getTransaction().table();

            List<CommitTaskData> commitTasks = fragments.stream()
                    .map(slice -> commitTaskCodec.fromJson(slice.getBytes()))
                    .collect(toImmutableList());

            Type[] partitionColumnTypes = icebergTable.spec().fields().stream()
                    .map(field -> field.transform().getResultType(
                            icebergTable.schema().findType(field.sourceId())))
                    .toArray(Type[]::new);

            Set<DataFile> newFiles = new HashSet<>();
            for (CommitTaskData task : commitTasks) {
                DataFiles.Builder builder = DataFiles.builder(icebergTable.spec())
                        .withPath(task.getPath())
                        .withFileSizeInBytes(task.getFileSizeInBytes())
                        .withFormat(handle.getFileFormat().name())
                        .withMetrics(task.getMetrics().metrics());

                if (!icebergTable.spec().fields().isEmpty()) {
                    String partitionDataJson = task.getPartitionDataJson()
                            .orElseThrow(() -> new VerifyException("No partition data for partitioned table"));
                    builder.withPartition(PartitionData.fromJson(partitionDataJson, partitionColumnTypes));
                }
                newFiles.add(builder.build());
            }

            RewriteFiles rewriteFiles = transactionContext.getTransaction().newRewrite();
            Set<DataFile> scannedDataFiles = transactionContext.getScannedDataFiles();
            Set<DeleteFile> fullyAppliedDeleteFiles = transactionContext.getFullyAppliedDeleteFiles();
            rewriteFiles.rewriteFiles(scannedDataFiles, fullyAppliedDeleteFiles, newFiles, ImmutableSet.of());

            // Table.snapshot returns null if there is no matching snapshot
            Snapshot snapshot = requireNonNull(
                    handle.getTableName()
                            .getSnapshotId()
                            .map(icebergTable::snapshot)
                            .orElse(null),
                    "snapshot is null");
            if (icebergTable.currentSnapshot() != null) {
                rewriteFiles.validateFromSnapshot(snapshot.snapshotId());
            }
            rewriteFiles.commit();
        }
    }
}
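
For reference, the finish step above drives Iceberg's RewriteFiles API. A minimal standalone sketch of that API under the same semantics, assuming an already-loaded org.apache.iceberg.Table (the helper class and parameter names below are illustrative, not part of this commit):

import com.google.common.collect.ImmutableSet;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.DeleteFile;
import org.apache.iceberg.RewriteFiles;
import org.apache.iceberg.Table;
import org.apache.iceberg.Transaction;

import java.util.Set;

public class RewriteFilesSketch
{
    public static void rewrite(Table table, Set<DataFile> scannedDataFiles, Set<DeleteFile> appliedDeleteFiles, Set<DataFile> rewrittenFiles)
    {
        Transaction transaction = table.newTransaction();
        RewriteFiles rewriteFiles = transaction.newRewrite();
        // Atomically replace the scanned data files (and any fully applied
        // delete files) with the rewritten files in one new snapshot.
        rewriteFiles.rewriteFiles(scannedDataFiles, appliedDeleteFiles, rewrittenFiles, ImmutableSet.of());
        if (table.currentSnapshot() != null) {
            // Fail the commit if conflicting changes landed after this snapshot.
            rewriteFiles.validateFromSnapshot(table.currentSnapshot().snapshotId());
        }
        rewriteFiles.commit();
        transaction.commitTransaction();
    }
}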
