-
Notifications
You must be signed in to change notification settings - Fork 156
Add support for row lineage in v3 #735
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 6 commits
c443068
ff6d1e6
60a3ef6
8f304a4
6a97c4d
b21cd14
a0707e9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,61 @@ | ||
| // Licensed to the Apache Software Foundation (ASF) under one | ||
| // or more contributor license agreements. See the NOTICE file | ||
| // distributed with this work for additional information | ||
| // regarding copyright ownership. The ASF licenses this file | ||
| // to you under the Apache License, Version 2.0 (the | ||
| // "License"); you may not use it except in compliance | ||
| // with the License. You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, | ||
| // software distributed under the License is distributed on an | ||
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| // KIND, either express or implied. See the License for the | ||
| // specific language governing permissions and limitations | ||
| // under the License. | ||
|
|
||
| package iceberg | ||
|
|
||
// Reserved field IDs of the row lineage metadata columns (v3+). Per the Iceberg spec
// (Metadata Columns / Row Lineage) they are Integer.MAX_VALUE - 107 and
// Integer.MAX_VALUE - 108, where Integer.MAX_VALUE is 1<<31 - 1 (2147483647).
const (
	// RowIDFieldID is the field ID of _row_id (optional long), a unique long
	// identifier for every row.
	RowIDFieldID = (1<<31 - 1) - 107 // 2147483540

	// LastUpdatedSequenceNumberFieldID is the field ID of
	// _last_updated_sequence_number (optional long), the sequence number of the
	// commit that last updated the row.
	LastUpdatedSequenceNumberFieldID = (1<<31 - 1) - 108 // 2147483539
)
|
|
||
// Names of the row lineage metadata columns (v3+).
const (
	// RowIDColumnName is the name of the column holding the row ID.
	RowIDColumnName = "_row_id"

	// LastUpdatedSequenceNumberColumnName is the name of the column holding the
	// last-updated sequence number.
	LastUpdatedSequenceNumberColumnName = "_last_updated_sequence_number"
)
|
|
||
| // RowID returns a NestedField for _row_id (optional long) for use in schemas that include row lineage. | ||
| func RowID() NestedField { | ||
| return NestedField{ | ||
| ID: RowIDFieldID, | ||
| Name: RowIDColumnName, | ||
| Required: false, | ||
| Doc: "Implicit row ID that is automatically assigned", | ||
| Type: Int64Type{}, | ||
| } | ||
| } | ||
|
|
||
| // LastUpdatedSequenceNumber returns a NestedField for _last_updated_sequence_number (optional long). | ||
| func LastUpdatedSequenceNumber() NestedField { | ||
| return NestedField{ | ||
| ID: LastUpdatedSequenceNumberFieldID, | ||
| Name: LastUpdatedSequenceNumberColumnName, | ||
| Required: false, | ||
| Doc: "Sequence number when the row was last updated", | ||
| Type: Int64Type{}, | ||
| } | ||
| } | ||
|
|
||
| // IsMetadataColumn returns true if the field ID is a reserved metadata column (e.g. row lineage). | ||
| func IsMetadataColumn(fieldID int) bool { | ||
| return fieldID == RowIDFieldID || fieldID == LastUpdatedSequenceNumberFieldID | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -385,6 +385,91 @@ func (as *arrowScan) getRecordFilter(ctx context.Context, fileSchema *iceberg.Sc | |
| return nil, false, nil | ||
| } | ||
|
|
||
| // synthesizeRowLineageColumns fills _row_id and _last_updated_sequence_number from task constants | ||
| // when those columns are present in the batch (e.g. from ToRequestedSchema). Per the Iceberg v3 | ||
| // row lineage spec: if the value is null in the file, it is inherited (synthesized) from the file's | ||
| // first_row_id and data_sequence_number; otherwise the value from the file is kept. | ||
| // rowOffset is the 0-based row index within the current file and is updated so _row_id stays | ||
| // correct across multiple batches from the same file (first_row_id + row_position). | ||
| func synthesizeRowLineageColumns( | ||
| ctx context.Context, | ||
| rowOffset *int64, | ||
| task FileScanTask, | ||
| batch arrow.RecordBatch, | ||
| ) (arrow.RecordBatch, error) { | ||
| alloc := compute.GetAllocator(ctx) | ||
| schema := batch.Schema() | ||
| ncols := int(batch.NumCols()) | ||
| nrows := batch.NumRows() | ||
| newCols := make([]arrow.Array, ncols) | ||
|
|
||
| // Resolve column indices by name; -1 if not present. | ||
| rowIDIndices := schema.FieldIndices(iceberg.RowIDColumnName) | ||
| seqNumIndices := schema.FieldIndices(iceberg.LastUpdatedSequenceNumberColumnName) | ||
| rowIDColIdx := -1 | ||
| if len(rowIDIndices) > 0 { | ||
| rowIDColIdx = rowIDIndices[0] | ||
| } | ||
| seqNumColIdx := -1 | ||
| if len(seqNumIndices) > 0 { | ||
| seqNumColIdx = seqNumIndices[0] | ||
| } | ||
|
|
||
| for i := 0; i < ncols; i++ { | ||
| if i == rowIDColIdx && task.FirstRowID != nil { | ||
|
||
| // _row_id: inherit first_row_id + row_position when null; else keep value from file. | ||
| if col, ok := batch.Column(i).(*array.Int64); ok { | ||
| bldr := array.NewInt64Builder(alloc) | ||
| first := *task.FirstRowID | ||
| for k := int64(0); k < nrows; k++ { | ||
| if col.IsNull(int(k)) { | ||
| bldr.Append(first + *rowOffset + k) | ||
| } else { | ||
| bldr.Append(col.Value(int(k))) | ||
| } | ||
| } | ||
| newCols[i] = bldr.NewArray() | ||
| bldr.Release() | ||
|
||
|
|
||
| continue | ||
| } | ||
| } | ||
|
|
||
| if i == seqNumColIdx && task.DataSequenceNumber != nil { | ||
|
||
| // _last_updated_sequence_number: inherit file's data_sequence_number when null; else keep value from file. | ||
| if col, ok := batch.Column(i).(*array.Int64); ok { | ||
| bldr := array.NewInt64Builder(alloc) | ||
| seq := *task.DataSequenceNumber | ||
| for k := int64(0); k < nrows; k++ { | ||
| if col.IsNull(int(k)) { | ||
| bldr.Append(seq) | ||
| } else { | ||
| bldr.Append(col.Value(int(k))) | ||
| } | ||
| } | ||
| newCols[i] = bldr.NewArray() | ||
| bldr.Release() | ||
|
||
|
|
||
| continue | ||
| } | ||
| } | ||
|
|
||
| col := batch.Column(i) | ||
| col.Retain() | ||
|
||
| newCols[i] = col | ||
| } | ||
|
|
||
| // Advance so the next batch from this file uses the correct row position for _row_id. | ||
| *rowOffset += nrows | ||
|
|
||
| rec := array.NewRecordBatch(schema, newCols, nrows) | ||
| for _, c := range newCols { | ||
| c.Release() | ||
| } | ||
|
|
||
| return rec, nil | ||
| } | ||
|
|
||
| func (as *arrowScan) processRecords( | ||
| ctx context.Context, | ||
| task internal.Enumerated[FileScanTask], | ||
|
|
@@ -513,6 +598,17 @@ func (as *arrowScan) recordsFromTask(ctx context.Context, task internal.Enumerat | |
| return ToRequestedSchema(ctx, as.projectedSchema, iceSchema, r, false, false, as.useLargeTypes) | ||
| }) | ||
|
|
||
| // Row lineage (v3): fill _row_id and _last_updated_sequence_number from task constants when in projection. | ||
| if task.Value.FirstRowID != nil || task.Value.DataSequenceNumber != nil { | ||
| var rowOffset int64 | ||
| taskVal := task.Value | ||
| pipeline = append(pipeline, func(r arrow.RecordBatch) (arrow.RecordBatch, error) { | ||
| defer r.Release() | ||
|
|
||
| return synthesizeRowLineageColumns(ctx, &rowOffset, taskVal, r) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I haven't double-checked the spec, but should the row lineage columns be toggleable via a setting, i.e. a way to turn them off if you don't want them to show up in the results? |
||
| }) | ||
| } | ||
|
|
||
| err = as.processRecords(ctx, task, iceSchema, rdr, colIndices, pipeline, out) | ||
|
|
||
| return err | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Same root cause as #762 — bldr.NewArray() starts at refcount=1, array.NewRecordBatch retains to refcount=2, but the local refs in newCols are never released. Fix needs a release loop after the batch is created:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for pointing this out. I see that PR #762 has been approved and is waiting to be merged. Let me know once it lands in
main so that I can rebase and apply the fix for this PR.