@@ -127,7 +127,7 @@ public org.apache.iceberg.Schema getRequiredSchema() {
     if (cachedRequiredSchema == null) {
       cachedRequiredSchema =
           resolveSchema(
-              getTable().schema(),
+              IcebergUtils.beamSchemaToIcebergSchema(getSchema()),
              getKeepFields(),
              getDropFields(),
              FilterUtils.getReferencedFieldNames(getFilterString()));
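This hunk resolves the required schema from the scan's configured Beam schema rather than re-fetching the catalog table's live schema, so the projection stays pinned to what was set at configuration time even if the table evolves afterwards. A minimal sketch of the idea using only Iceberg's public schema API; the field names and ids are invented for illustration, not taken from the PR:

import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;

public class PinnedSchemaSketch {
  public static void main(String[] args) {
    // A schema fixed at configuration time. In the PR, resolveSchema(...) applies
    // the keep/drop/filter field names against a schema like this, instead of a
    // freshly fetched table.schema() that may have gained or lost columns.
    Schema pinned =
        new Schema(
            Types.NestedField.required(1, "id", Types.LongType.get()),
            Types.NestedField.optional(2, "name", Types.StringType.get()));
    System.out.println(pinned.asStruct());
  }
}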
@@ -42,11 +42,11 @@
  * source creates a single range, while the unbounded implementation continuously polls for new
  * snapshots at the specified interval.
  */
-class IncrementalScanSource extends PTransform<PBegin, PCollection<Row>> {
+public class IncrementalScanSource extends PTransform<PBegin, PCollection<Row>> {
   private static final Duration DEFAULT_POLL_INTERVAL = Duration.standardSeconds(60);
-  private final IcebergScanConfig scanConfig;
+  protected final IcebergScanConfig scanConfig;
 
-  IncrementalScanSource(IcebergScanConfig scanConfig) {
+  public IncrementalScanSource(IcebergScanConfig scanConfig) {
     this.scanConfig = scanConfig;
   }
 
@@ -74,14 +74,15 @@ public PCollection<Row> expand(PBegin input) {
   }
 
   /** Continuously watches for new snapshots. */
-  private PCollection<KV<String, List<SnapshotInfo>>> unboundedSnapshots(PBegin input) {
+  protected PCollection<KV<String, List<SnapshotInfo>>> unboundedSnapshots(PBegin input) {
     Duration pollInterval =
         MoreObjects.firstNonNull(scanConfig.getPollInterval(), DEFAULT_POLL_INTERVAL);
     return input.apply("Watch for Snapshots", new WatchForSnapshots(scanConfig, pollInterval));
   }
 
   /** Creates a fixed snapshot range. */
-  private PCollection<KV<String, List<SnapshotInfo>>> boundedSnapshots(PBegin input, Table table) {
+  protected PCollection<KV<String, List<SnapshotInfo>>> boundedSnapshots(
+      PBegin input, Table table) {
     checkStateNotNull(
         table.currentSnapshot().snapshotId(),
         "Table %s does not have any snapshots to read from.",
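The visibility changes above (package-private class to public, private members and constructor to protected/public) open IncrementalScanSource up for extension. A minimal sketch of what that enables, assuming only what the diff shows; CustomScanSource is hypothetical and not part of the PR:

// Hypothetical subclass made possible by the widened visibility.
public class CustomScanSource extends IncrementalScanSource {
  public CustomScanSource(IcebergScanConfig scanConfig) {
    super(scanConfig); // constructor is now public
  }
  // scanConfig, unboundedSnapshots(...), and boundedSnapshots(...) are now
  // protected, so a subclass can reuse or override the snapshot-discovery
  // steps that back expand(...).
}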
@@ -25,8 +25,15 @@
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap;
+import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Maps;
+import org.apache.iceberg.ContentFile;
+import org.apache.iceberg.MetadataColumns;
 import org.apache.iceberg.PartitionField;
 import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.StructLike;
+import org.apache.iceberg.types.Type;
+import org.apache.iceberg.types.Types;
 import org.checkerframework.checker.nullness.qual.Nullable;
 
 class PartitionUtils {
@@ -90,4 +97,51 @@ static PartitionSpec toPartitionSpec(
 
     return builder.build();
   }
+
+  /**
+   * Copied over from Apache Iceberg's <a
+   * href="https://github.com/apache/iceberg/blob/main/core/src/main/java/org/apache/iceberg/util/PartitionUtil.java">PartitionUtil</a>.
+   */
+  public static Map<Integer, ?> constantsMap(
+      PartitionSpec spec, ContentFile<?> file, BiFunction<Type, Object, Object> convertConstant) {
+    StructLike partitionData = file.partition();
+
+    // use java.util.HashMap because partition data may contain null values
+    Map<Integer, Object> idToConstant = Maps.newHashMap();
+
+    // add first_row_id as _row_id
+    if (file.firstRowId() != null) {
+      idToConstant.put(
+          MetadataColumns.ROW_ID.fieldId(),
+          convertConstant.apply(Types.LongType.get(), file.firstRowId()));
+    }
+
+    idToConstant.put(
+        MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER.fieldId(),
+        convertConstant.apply(Types.LongType.get(), file.fileSequenceNumber()));
+
+    // add _file
+    idToConstant.put(
+        MetadataColumns.FILE_PATH.fieldId(),
+        convertConstant.apply(Types.StringType.get(), file.location()));
+
+    // add _spec_id
+    idToConstant.put(
+        MetadataColumns.SPEC_ID.fieldId(),
+        convertConstant.apply(Types.IntegerType.get(), file.specId()));
+
+    List<Types.NestedField> partitionFields = spec.partitionType().fields();
+    List<PartitionField> fields = spec.fields();
+    for (int pos = 0; pos < fields.size(); pos += 1) {
+      PartitionField field = fields.get(pos);
+      if (field.transform().isIdentity()) {
+        Object converted =
+            convertConstant.apply(
+                partitionFields.get(pos).type(), partitionData.get(pos, Object.class));
+        idToConstant.put(field.sourceId(), converted);
+      }
+    }
+
+    return idToConstant;
+  }
 }
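For context, constantsMap produces the per-file constants (metadata columns such as _file and _spec_id, plus identity-partition values) that a reader injects into every record rather than reading them from the data file itself. A hedged usage sketch against Iceberg's public API; the schema, partition spec, and data file below are invented for illustration, and the call assumes same-package access to PartitionUtils:

import java.util.Map;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.DataFiles;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;

public class ConstantsMapSketch {
  public static void main(String[] args) {
    Schema schema =
        new Schema(
            Types.NestedField.required(1, "id", Types.LongType.get()),
            Types.NestedField.required(2, "category", Types.StringType.get()));
    PartitionSpec spec = PartitionSpec.builderFor(schema).identity("category").build();
    DataFile dataFile =
        DataFiles.builder(spec)
            .withPath("/tmp/warehouse/category=books/data.parquet")
            .withPartitionPath("category=books")
            .withFileSizeInBytes(1024L)
            .withRecordCount(100L)
            .build();
    // Identity conversion: keep each constant in its raw Iceberg representation.
    Map<Integer, ?> constants =
        PartitionUtils.constantsMap(spec, dataFile, (type, value) -> value);
    // Maps the _file and _spec_id field ids, plus the "category" source field
    // id, to fixed values for this file.
    System.out.println(constants);
  }
}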
@@ -75,9 +75,7 @@ public void process(
       }
       FileScanTask task = fileScanTasks.get((int) l);
       Schema beamSchema = IcebergUtils.icebergSchemaToBeamSchema(scanConfig.getProjectedSchema());
-      try (CloseableIterable<Record> fullIterable =
-          ReadUtils.createReader(task, table, scanConfig.getRequiredSchema())) {
-        CloseableIterable<Record> reader = ReadUtils.maybeApplyFilter(fullIterable, scanConfig);
+      try (CloseableIterable<Record> reader = ReadUtils.createReader(task, table, scanConfig)) {
 
         for (Record record : reader) {
           Row row = IcebergUtils.icebergRecordToBeamRow(beamSchema, record);
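This last hunk collapses the two-step create-then-filter sequence into a single ReadUtils.createReader(task, table, scanConfig) call, so one try-with-resources owns the whole reader chain. The PR excerpt does not show the new overload's body; the following is an assumed sketch based only on the two calls it replaces, not the actual Beam implementation:

// Assumed shape of the consolidated helper: build the reader for the required
// schema, then fold the residual filter in before returning a single
// closeable iterable.
static CloseableIterable<Record> createReader(
    FileScanTask task, Table table, IcebergScanConfig scanConfig) {
  CloseableIterable<Record> fullIterable =
      ReadUtils.createReader(task, table, scanConfig.getRequiredSchema());
  return ReadUtils.maybeApplyFilter(fullIterable, scanConfig);
}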