-
Notifications
You must be signed in to change notification settings - Fork 210
feat: mysql chunking optimisation #797
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: staging
Are you sure you want to change the base?
Changes from 6 commits
83ebf36
f5766f8
443cf94
6fc574c
c09aee8
53520de
3b9fbe7
8e4ba6a
1707ae1
feca5a0
ccfb371
fe4b4b2
910246a
1eacf5a
964a2ee
11a9f03
348c21a
94a6fd8
aa03463
bc1abf8
6e5e82f
d9189b9
69a1714
96b5689
66749dd
fa24a2c
8411568
64f31c1
debd4eb
86a2d91
8ead67e
0caf2aa
8ccfdd6
7754d72
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -41,15 +41,16 @@ func (m *MySQL) ChunkIterator(ctx context.Context, stream types.StreamInterface, | |
| sort.Strings(pkColumns) | ||
|
|
||
| logger.Debugf("Starting backfill from %v to %v with filter: %s, args: %v", chunk.Min, chunk.Max, filter, args) | ||
| // Get chunks from state or calculate new ones | ||
| stmt := "" | ||
| // Get chunks from state or calculate new ones
saksham-datazip marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| var stmt string | ||
| if chunkColumn != "" { | ||
| stmt = jdbc.MysqlChunkScanQuery(stream, []string{chunkColumn}, chunk, filter) | ||
| } else if len(pkColumns) > 0 { | ||
| stmt = jdbc.MysqlChunkScanQuery(stream, pkColumns, chunk, filter) | ||
| } else { | ||
| stmt = jdbc.MysqlLimitOffsetScanQuery(stream, chunk, filter) | ||
| } | ||
|
|
||
saksham-datazip marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| logger.Debugf("Executing chunk query: %s", stmt) | ||
| setter := jdbc.NewReader(ctx, stmt, func(ctx context.Context, query string, queryArgs ...any) (*sql.Rows, error) { | ||
| return tx.QueryContext(ctx, query, args...) | ||
|
|
@@ -93,21 +94,39 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo | |
| chunkSize := int64(math.Ceil(float64(constants.EffectiveParquetSize) / avgRowSizeFloat)) | ||
| chunks := types.NewSet[types.Chunk]() | ||
| chunkColumn := stream.Self().StreamMetadata.ChunkColumn | ||
|
|
||
| var ( | ||
| isEvenDistribution bool | ||
| step int64 | ||
| minVal any // to define the lower range of the chunk | ||
| maxVal any // to define the upper range of the chunk | ||
| minFloat float64 | ||
| maxFloat float64 | ||
| ) | ||
|
|
||
| pkColumns := stream.GetStream().SourceDefinedPrimaryKey.Array() | ||
| if chunkColumn != "" { | ||
| pkColumns = []string{chunkColumn} | ||
| } | ||
| sort.Strings(pkColumns) | ||
|
|
||
| if stream.GetStream().SourceDefinedPrimaryKey.Len() > 0 || chunkColumn != "" { | ||
|
||
| err = jdbc.WithIsolation(ctx, m.client, true, func(tx *sql.Tx) error { | ||
|
||
| var err error | ||
| minVal, maxVal, err = m.getTableExtremes(ctx, stream, pkColumns, tx) | ||
| return err | ||
| }) | ||
| if err != nil { | ||
| return nil, fmt.Errorf("failed to get table extremes: %s", err) | ||
| } | ||
| } | ||
| if len(pkColumns) == 1 { | ||
| isEvenDistribution, step, minFloat, maxFloat = shouldUseEvenDistribution(minVal, maxVal, approxRowCount, chunkSize) | ||
| } | ||
vishalm0509 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| // Takes the user defined batch size as chunkSize | ||
| // TODO: common-out the chunking logic for db2, mssql, mysql | ||
| splitViaPrimaryKey := func(stream types.StreamInterface, chunks *types.Set[types.Chunk]) error { | ||
|
Comment on lines
+155
to
159
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Leave a blank line here.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. solved |
||
| return jdbc.WithIsolation(ctx, m.client, true, func(tx *sql.Tx) error { | ||
| // Get primary key column using the provided function | ||
| pkColumns := stream.GetStream().SourceDefinedPrimaryKey.Array() | ||
| if chunkColumn != "" { | ||
| pkColumns = []string{chunkColumn} | ||
| } | ||
| sort.Strings(pkColumns) | ||
| // Get table extremes | ||
| minVal, maxVal, err := m.getTableExtremes(ctx, stream, pkColumns, tx) | ||
| if err != nil { | ||
| return fmt.Errorf("failed to get table extremes: %s", err) | ||
| } | ||
| if minVal == nil { | ||
| return nil | ||
| } | ||
|
|
@@ -180,7 +199,31 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo | |
| }) | ||
| } | ||
|
|
||
| if stream.GetStream().SourceDefinedPrimaryKey.Len() > 0 || chunkColumn != "" { | ||
| //used mathematical calculation to split the chunks for cases where the distribution factor is within the range | ||
| splitEvenlyForInt := func(minf, maxf float64, chunks *types.Set[types.Chunk], step float64) { | ||
| if minf+step > maxf { | ||
| chunks.Insert(types.Chunk{ | ||
| Min: nil, | ||
| Max: nil, | ||
|
||
| }) | ||
| return | ||
| } | ||
| prev := minf | ||
| for next := minf + step; next <= maxf; next += step { | ||
| chunks.Insert(types.Chunk{ | ||
| Min: utils.ConvertToString(prev), | ||
| Max: utils.ConvertToString(next), | ||
| }) | ||
| prev = next | ||
| } | ||
| chunks.Insert(types.Chunk{ | ||
| Min: utils.ConvertToString(prev), | ||
| Max: nil, | ||
| }) | ||
| } | ||
| if len(pkColumns) == 1 && isEvenDistribution { | ||
| splitEvenlyForInt(minFloat, maxFloat, chunks, float64(step)) | ||
|
||
| } else if len(pkColumns) > 0 { | ||
| err = splitViaPrimaryKey(stream, chunks) | ||
| } else { | ||
| err = limitOffsetChunking(chunks) | ||
|
|
@@ -193,3 +236,20 @@ func (m *MySQL) getTableExtremes(ctx context.Context, stream types.StreamInterfa | |
| err = tx.QueryRowContext(ctx, query).Scan(&min, &max) | ||
| return min, max, err | ||
| } | ||
|
|
||
vaibhav-datazip marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| func shouldUseEvenDistribution(minVal any, maxVal any, approxRowCount int64, chunkSize int64) (bool, int64, float64, float64) { | ||
|
||
| if approxRowCount == 0 { | ||
| return false, 0, 0, 0 | ||
| } | ||
| minFloat, err1 := typeutils.ReformatFloat64(minVal) | ||
|
||
| maxFloat, err2 := typeutils.ReformatFloat64(maxVal) | ||
| if err1 != nil || err2 != nil { | ||
| return false, 0, 0, 0 | ||
|
||
| } | ||
| distributionFactor := (maxFloat - minFloat + 1) / float64(approxRowCount) | ||
| if distributionFactor < constants.DistributionLower || distributionFactor > constants.DistributionUpper { | ||
| return false, 0, 0, 0 | ||
| } | ||
| step := int64(math.Max(distributionFactor*float64(chunkSize), 1)) | ||
| return true, step, minFloat, maxFloat | ||
|
||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -429,6 +429,9 @@ func MysqlLimitOffsetScanQuery(stream types.StreamInterface, chunk types.Chunk, | |
| func MysqlChunkScanQuery(stream types.StreamInterface, filterColumns []string, chunk types.Chunk, extraFilter string) string { | ||
| condition := buildChunkConditionMySQL(filterColumns, chunk, extraFilter) | ||
| quotedTable := QuoteTable(stream.Namespace(), stream.Name(), constants.MySQL) | ||
| if condition == "" { | ||
| condition = utils.Ternary(extraFilter != "", extraFilter, "1 = 1").(string) | ||
| } | ||
|
||
| return fmt.Sprintf("SELECT * FROM %s WHERE %s", quotedTable, condition) | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How did we choose these distribution factors?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is an assumption; Airbyte uses the same distribution-factor bounds as well.