-
Notifications
You must be signed in to change notification settings - Fork 170
Redesign applySchemaDeltas to only apply added columns to catalog #3768
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,7 +6,6 @@ | |
| "errors" | ||
| "fmt" | ||
| "log/slog" | ||
| "slices" | ||
| "sync/atomic" | ||
| "time" | ||
|
|
||
|
|
@@ -64,31 +63,57 @@ | |
|
|
||
| func (a *FlowableActivity) applySchemaDeltas( | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Did a quick code search and I see a few other places that are using
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Good call-out, will take a look |
||
| ctx context.Context, | ||
| config *protos.FlowConnectionConfigsCore, | ||
| options *protos.SyncFlowOptions, | ||
| flowJobName string, | ||
| schemaDeltas []*protos.TableSchemaDelta, | ||
| ) error { | ||
| filteredTableMappings := make([]*protos.TableMapping, 0, len(schemaDeltas)) | ||
| for _, tableMapping := range options.TableMappings { | ||
| if slices.ContainsFunc(schemaDeltas, func(schemaDelta *protos.TableSchemaDelta) bool { | ||
| return schemaDelta.SrcTableName == tableMapping.SourceTableIdentifier && | ||
| schemaDelta.DstTableName == tableMapping.DestinationTableIdentifier | ||
| }) { | ||
| filteredTableMappings = append(filteredTableMappings, tableMapping) | ||
| } | ||
| } | ||
|
|
||
| if len(schemaDeltas) > 0 { | ||
| if err := a.SetupTableSchema(ctx, &protos.SetupTableSchemaBatchInput{ | ||
| PeerName: config.SourceName, | ||
| TableMappings: filteredTableMappings, | ||
| FlowName: config.FlowJobName, | ||
| System: config.System, | ||
| Env: config.Env, | ||
| Version: config.Version, | ||
| }); err != nil { | ||
| return a.Alerter.LogFlowError(ctx, config.FlowJobName, fmt.Errorf("failed to execute schema update at source: %w", err)) | ||
| logger := internal.LoggerFromCtx(ctx) | ||
| destinationNameListInDeltas := make([]string, 0, len(schemaDeltas)) | ||
| for _, tableSchemaDelta := range schemaDeltas { | ||
| destinationNameListInDeltas = append(destinationNameListInDeltas, tableSchemaDelta.DstTableName) | ||
| } | ||
| logger.Info("loading table schemas from catalog for applying schema deltas", | ||
| slog.Int("numDeltas", len(schemaDeltas)), | ||
| slog.String("flowJobName", flowJobName), | ||
| ) | ||
| schemasInCatalog, err := internal.LoadTableSchemasFromCatalog(ctx, a.CatalogPool, flowJobName, destinationNameListInDeltas) | ||
| if err != nil { | ||
| return fmt.Errorf("failed to load table schemas from catalog for applying schema deltas: %w", err) | ||
| } | ||
| for _, tableSchemaDelta := range schemaDeltas { | ||
| columnsInCatalog := schemasInCatalog[tableSchemaDelta.DstTableName].GetColumns() | ||
| addedColumns := tableSchemaDelta.GetAddedColumns() | ||
|
|
||
| // Create a map to track existing column names to avoid duplicates | ||
| existingColumnNames := make(map[string]bool) | ||
| for _, col := range columnsInCatalog { | ||
| existingColumnNames[col.Name] = true | ||
| } | ||
|
|
||
| // Only add columns that don't already exist | ||
| var newColumnsToAdd []*protos.FieldDescription | ||
| for _, addedCol := range addedColumns { | ||
| if !existingColumnNames[addedCol.Name] { | ||
| newColumnsToAdd = append(newColumnsToAdd, addedCol) | ||
| existingColumnNames[addedCol.Name] = true | ||
| } | ||
| } | ||
| updatedColumnsInCatalog := append(columnsInCatalog, newColumnsToAdd...) | ||
| schemasInCatalog[tableSchemaDelta.DstTableName].Columns = updatedColumnsInCatalog | ||
| } | ||
| logger.Info("applying schema deltas to catalog", | ||
| slog.Int("numTables", len(schemasInCatalog)), | ||
| slog.Int("numDeltas", len(schemaDeltas)), | ||
| slog.String("flowJobName", flowJobName), | ||
| ) | ||
| err = internal.UpdateTableSchemasInCatalog( | ||
| ctx, | ||
| a.CatalogPool, | ||
| logger, | ||
| flowJobName, | ||
| schemasInCatalog, | ||
| ) | ||
| if err != nil { | ||
| return fmt.Errorf("failed to update table schemas in catalog: %w", err) | ||
| } | ||
| return nil | ||
| } | ||
|
|
@@ -223,7 +248,7 @@ | |
| return nil, fmt.Errorf("failed to sync schema: %w", err) | ||
| } | ||
|
|
||
| return nil, a.applySchemaDeltas(ctx, config, options, recordBatchSync.SchemaDeltas) | ||
| return nil, a.applySchemaDeltas(ctx, config.FlowJobName, recordBatchSync.SchemaDeltas) | ||
| } | ||
|
|
||
| var res *model.SyncResponse | ||
|
|
@@ -319,7 +344,7 @@ | |
| a.OtelManager.Metrics.CurrentBatchIdGauge.Record(ctx, res.CurrentSyncBatchID) | ||
|
|
||
| syncState.Store(shared.Ptr("updating schema")) | ||
| if err := a.applySchemaDeltas(ctx, config, options, res.TableSchemaDeltas); err != nil { | ||
| if err := a.applySchemaDeltas(ctx, config.FlowJobName, res.TableSchemaDeltas); err != nil { | ||
| return nil, err | ||
| } | ||
|
|
||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For
MigratePostgresTableOIDs: IIUC, on pause/restart, we also sync what's in the source DB to our catalog. So there may be a chance here too that, if there are schema changes between pause and resume, we would end up syncing the catalog to the latest schema, and ApplySchemaDelta could miss column additions.
I'm wondering what the motivation is here for always syncing the schema to latest on pause/restart, versus just letting CDC do the catch-up via ApplySchemaDelta.
Note this doesn't block your PR, just an observation and maybe a follow-up item.
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Actually, this activity doesn't touch Postgres: it moves the OIDs from the
state object to the catalog. The OIDs in state are populated during the initial setup flow and can be extended by the setup flows of table additions, but they are independent of schema changes, so we should be good? P.S.: This migration is what enables table cancellation/addition (added recently); it can be removed after a certain period of time.