From 83ebf36b143c2dcfab0ba4bf5f0a7876ec94649f Mon Sep 17 00:00:00 2001 From: saksham-datazip Date: Tue, 27 Jan 2026 17:08:44 +0530 Subject: [PATCH 01/23] feat: mysql chunking optimization --- constants/constants.go | 4 + drivers/mysql/internal/backfill.go | 309 ++++++++++++++++++++--------- 2 files changed, 223 insertions(+), 90 deletions(-) diff --git a/constants/constants.go b/constants/constants.go index 3caf330ac..925291aab 100644 --- a/constants/constants.go +++ b/constants/constants.go @@ -34,6 +34,10 @@ const ( EffectiveParquetSize = int64(256) * 1024 * 1024 * int64(8) DB2StateTimestampFormat = "2006-01-02 15:04:05.000000" DefaultStateTimestampFormat = "2006-01-02T15:04:05.000000000Z" + // DistributionLower is the lower bound for distribution factor + DistributionLower = 0.05 + // DistributionUpper is the upper bound for distribution factor + DistributionUpper = 100.0 ) type DriverType string diff --git a/drivers/mysql/internal/backfill.go b/drivers/mysql/internal/backfill.go index 9ae829797..793fe9f90 100644 --- a/drivers/mysql/internal/backfill.go +++ b/drivers/mysql/internal/backfill.go @@ -41,20 +41,39 @@ func (m *MySQL) ChunkIterator(ctx context.Context, stream types.StreamInterface, sort.Strings(pkColumns) logger.Debugf("Starting backfill from %v to %v with filter: %s, args: %v", chunk.Min, chunk.Max, filter, args) - // Get chunks from state or calculate new ones - stmt := "" - if chunkColumn != "" { - stmt = jdbc.MysqlChunkScanQuery(stream, []string{chunkColumn}, chunk, filter) - } else if len(pkColumns) > 0 { - stmt = jdbc.MysqlChunkScanQuery(stream, pkColumns, chunk, filter) + // Get cxhunks from state or calculate new ones + var stmt string + + // FULL TABLE SCAN (prevents `WHERE ` SQL syntax error) + if chunk.Min == nil && chunk.Max == nil && filter == "" { + stmt = fmt.Sprintf( + "SELECT * FROM `%s`.`%s`", + stream.Namespace(), + stream.Name(), + ) } else { - stmt = jdbc.MysqlLimitOffsetScanQuery(stream, chunk, filter) + if chunkColumn != "" { + stmt = jdbc.MysqlChunkScanQuery(stream, []string{chunkColumn}, chunk, filter) + } else if len(pkColumns) > 0 { + stmt = jdbc.MysqlChunkScanQuery(stream, pkColumns, chunk, filter) + } else { + stmt = jdbc.MysqlLimitOffsetScanQuery(stream, chunk, filter) + } } + logger.Debugf("Executing chunk query: %s", stmt) setter := jdbc.NewReader(ctx, stmt, func(ctx context.Context, query string, queryArgs ...any) (*sql.Rows, error) { return tx.QueryContext(ctx, query, args...) }) - return jdbc.MapScanConcurrent(setter, m.dataTypeConverter, OnMessage) + // Capture and process rows + return setter.Capture(func(rows *sql.Rows) error { + record := make(types.Record) + err := jdbc.MapScan(rows, record, m.dataTypeConverter) + if err != nil { + return fmt.Errorf("failed to scan record data as map: %s", err) + } + return OnMessage(ctx, record) + }) }) } @@ -93,103 +112,213 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo chunkSize := int64(math.Ceil(float64(constants.EffectiveParquetSize) / avgRowSizeFloat)) chunks := types.NewSet[types.Chunk]() chunkColumn := stream.Self().StreamMetadata.ChunkColumn - // Takes the user defined batch size as chunkSize - // TODO: common-out the chunking logic for db2, mssql, mysql - splitViaPrimaryKey := func(stream types.StreamInterface, chunks *types.Set[types.Chunk]) error { - return jdbc.WithIsolation(ctx, m.client, true, func(tx *sql.Tx) error { - // Get primary key column using the provided function - pkColumns := stream.GetStream().SourceDefinedPrimaryKey.Array() - if chunkColumn != "" { - pkColumns = []string{chunkColumn} - } - sort.Strings(pkColumns) - // Get table extremes - minVal, maxVal, err := m.getTableExtremes(ctx, stream, pkColumns, tx) - if err != nil { - return fmt.Errorf("failed to get table extremes: %s", err) - } - if minVal == nil { - return nil - } - chunks.Insert(types.Chunk{ - Min: nil, - Max: utils.ConvertToString(minVal), - }) - logger.Infof("Stream %s extremes - min: %v, max: %v", stream.ID(), utils.ConvertToString(minVal), utils.ConvertToString(maxVal)) - - // Generate chunks based on range - query := jdbc.NextChunkEndQuery(stream, pkColumns, chunkSize) - currentVal := minVal - for { - // Split the current value into parts - columns := strings.Split(utils.ConvertToString(currentVal), ",") - - // Create args array with the correct number of arguments for the query - args := make([]interface{}, 0) - for columnIndex := 0; columnIndex < len(pkColumns); columnIndex++ { - // For each column combination in the WHERE clause, we need to add the necessary parts - for partIndex := 0; partIndex <= columnIndex && partIndex < len(columns); partIndex++ { - args = append(args, columns[partIndex]) - } - } - var nextValRaw interface{} - err := tx.QueryRowContext(ctx, query, args...).Scan(&nextValRaw) - if err == sql.ErrNoRows || nextValRaw == nil { - break - } else if err != nil { - return fmt.Errorf("failed to get next chunk end: %s", err) - } - if currentVal != nil && nextValRaw != nil { - chunks.Insert(types.Chunk{ - Min: utils.ConvertToString(currentVal), - Max: utils.ConvertToString(nextValRaw), - }) - } - currentVal = nextValRaw - } - if currentVal != nil { - chunks.Insert(types.Chunk{ - Min: utils.ConvertToString(currentVal), - Max: nil, - }) - } + logger.Infof("fghj %s", stream.ID()) + pkColumns := stream.GetStream().SourceDefinedPrimaryKey.Array() + if chunkColumn != "" { + pkColumns = []string{chunkColumn} + } + sort.Strings(pkColumns) + + // only meaningful for single-column PK + var ( + ok bool + step int64 + ) + var minVal, maxVal any - return nil + if stream.GetStream().SourceDefinedPrimaryKey.Len() > 0 || chunkColumn != "" { + err = jdbc.WithIsolation(ctx, m.client, true, func(tx *sql.Tx) error { + + var err error + minVal, maxVal, err = m.getTableExtremes(ctx, stream, pkColumns, tx) + return err }) + if err != nil { + return nil, fmt.Errorf("failed to get table extremes: %s", err) + } + } + + if len(pkColumns) == 1 { + ok, step = shouldUseEvenDistribution(minVal, maxVal, approxRowCount, chunkSize) } - limitOffsetChunking := func(chunks *types.Set[types.Chunk]) error { - return jdbc.WithIsolation(ctx, m.client, true, func(tx *sql.Tx) error { + + logger.Infof("fghjdbzjsbi %s", stream.ID()) + // EVEN distribution + if len(pkColumns) == 1 && ok { + logger.Infof("Splitting evenly for stream %s using step %d (min: %v, max: %v)", stream.ID(), step, minVal, maxVal) + splitEvenlyForInt(minVal, maxVal, chunks, step) + if err != nil { + return nil, fmt.Errorf("failed to split evenly: %s", err) + } + + } else if len(pkColumns) > 0 { + logger.Infof("Splitting via PK for stream %s", stream.ID()) + err = splitViaPrimaryKey( + ctx, + m, + stream, + chunks, + chunkColumn, + chunkSize, + minVal, + maxVal, + pkColumns, + ) + + //LIMIT / OFFSET fallback (no PK / no chunk column) + } else { + logger.Infof("Using Limit/Offset chunking for stream %s", stream.ID()) + err = limitOffsetChunking( + ctx, + m, + chunks, + chunkSize, + approxRowCount, + ) + } + logger.Infof("Generated %d chunks for stream %s", chunks.Len(), stream.ID()) + return chunks, err +} + +func (m *MySQL) getTableExtremes(ctx context.Context, stream types.StreamInterface, pkColumns []string, tx *sql.Tx) (min, max any, err error) { + query := jdbc.MinMaxQueryMySQL(stream, pkColumns) + err = tx.QueryRowContext(ctx, query).Scan(&min, &max) + return min, max, err +} + +func limitOffsetChunking(ctx context.Context, m *MySQL, chunks *types.Set[types.Chunk], chunkSize int64, approxRowCount int64) error { + return jdbc.WithIsolation(ctx, m.client, true, func(tx *sql.Tx) error { + chunks.Insert(types.Chunk{ + Min: nil, + Max: utils.ConvertToString(chunkSize), + }) + lastChunk := chunkSize + for lastChunk < approxRowCount { chunks.Insert(types.Chunk{ - Min: nil, - Max: utils.ConvertToString(chunkSize), + Min: utils.ConvertToString(lastChunk), + Max: utils.ConvertToString(lastChunk + chunkSize), }) - lastChunk := chunkSize - for lastChunk < approxRowCount { + lastChunk += chunkSize + } + chunks.Insert(types.Chunk{ + Min: utils.ConvertToString(lastChunk), + Max: nil, + }) + return nil + }) +} + +func splitViaPrimaryKey(ctx context.Context, m *MySQL, stream types.StreamInterface, chunks *types.Set[types.Chunk], chunkColumn string, chunkSize int64, minVal any, maxVal any, pkColumns []string) error { + return jdbc.WithIsolation(ctx, m.client, true, func(tx *sql.Tx) error { + chunks.Insert(types.Chunk{ + Min: nil, + Max: utils.ConvertToString(minVal), + }) + + logger.Infof("Stream %s extremes - min: %v, max: %v", stream.ID(), utils.ConvertToString(minVal), utils.ConvertToString(maxVal)) + + // Generate chunks based on range + query := jdbc.NextChunkEndQuery(stream, pkColumns, chunkSize) + currentVal := minVal + for { + // Split the current value into parts + columns := strings.Split(utils.ConvertToString(currentVal), ",") + + // Create args array with the correct number of arguments for the query + args := make([]interface{}, 0) + for columnIndex := 0; columnIndex < len(pkColumns); columnIndex++ { + // For each column combination in the WHERE clause, we need to add the necessary parts + for partIndex := 0; partIndex <= columnIndex && partIndex < len(columns); partIndex++ { + args = append(args, columns[partIndex]) + } + } + var nextValRaw interface{} + err := tx.QueryRowContext(ctx, query, args...).Scan(&nextValRaw) + if err == sql.ErrNoRows || nextValRaw == nil { + break + } else if err != nil { + return fmt.Errorf("failed to get next chunk end: %s", err) + } + if currentVal != nil && nextValRaw != nil { chunks.Insert(types.Chunk{ - Min: utils.ConvertToString(lastChunk), - Max: utils.ConvertToString(lastChunk + chunkSize), + Min: utils.ConvertToString(currentVal), + Max: utils.ConvertToString(nextValRaw), }) - lastChunk += chunkSize } + currentVal = nextValRaw + } + if currentVal != nil { chunks.Insert(types.Chunk{ - Min: utils.ConvertToString(lastChunk), + Min: utils.ConvertToString(currentVal), Max: nil, }) - return nil - }) + } + + return nil + }) +} + +func shouldUseEvenDistribution( + minVal any, + maxVal any, + approxRowCount int64, + chunkSize int64, +) (bool, int64) { + + if approxRowCount == 0 { + return false, 0 } - if stream.GetStream().SourceDefinedPrimaryKey.Len() > 0 || chunkColumn != "" { - err = splitViaPrimaryKey(stream, chunks) - } else { - err = limitOffsetChunking(chunks) + minF, err1 := typeutils.ReformatFloat64(minVal) + maxF, err2 := typeutils.ReformatFloat64(maxVal) + if err1 != nil || err2 != nil { + return false, 0 } - return chunks, err + if maxF < minF { + return false, 0 + } + // (max - min + 1) / rowCount(int64) + distributionFactor := (maxF - minF + 1) / float64(approxRowCount) + + // margin check + if distributionFactor < constants.DistributionLower || + distributionFactor > constants.DistributionUpper { + return false, 0 + } + + // dynamic chunk size or pk_steps + dynamicChunkSize := int64(math.Max(distributionFactor*float64(chunkSize), 1)) + + return true, dynamicChunkSize } -func (m *MySQL) getTableExtremes(ctx context.Context, stream types.StreamInterface, pkColumns []string, tx *sql.Tx) (min, max any, err error) { - query := jdbc.MinMaxQueryMySQL(stream, pkColumns) - err = tx.QueryRowContext(ctx, query).Scan(&min, &max) - return min, max, err +func splitEvenlyForInt(minVal, maxVal any, chunks *types.Set[types.Chunk], step int64) { + start, _ := typeutils.ReformatFloat64(minVal) + end, _ := typeutils.ReformatFloat64(maxVal) + logger.Infof("splitEvenlyForInt start=%v end=%v step=%d", start, end, step) + if start+float64(step) > end { + chunks.Insert(types.Chunk{ + Min: nil, + Max: nil, + }) + logger.Infof("Generated single chunk: %+v", chunks) + return + } + + prev := start + + for next := start + float64(step); next <= end; next += float64(step) { + chunks.Insert(types.Chunk{ + Min: utils.ConvertToString(prev), + Max: utils.ConvertToString(next), + }) + prev = next + } + + chunks.Insert(types.Chunk{ + Min: utils.ConvertToString(prev), + Max: nil, + }) + logger.Infof("Generated chunk: %+v", chunks) } From f5766f88b352ddf9306271d54b7833e8d9d8e197 Mon Sep 17 00:00:00 2001 From: saksham-datazip Date: Tue, 27 Jan 2026 17:19:46 +0530 Subject: [PATCH 02/23] mysql optimization comment resolve --- drivers/mysql/internal/backfill.go | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/mysql/internal/backfill.go b/drivers/mysql/internal/backfill.go index 793fe9f90..cbfadf147 100644 --- a/drivers/mysql/internal/backfill.go +++ b/drivers/mysql/internal/backfill.go @@ -113,7 +113,6 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo chunks := types.NewSet[types.Chunk]() chunkColumn := stream.Self().StreamMetadata.ChunkColumn - logger.Infof("fghj %s", stream.ID()) pkColumns := stream.GetStream().SourceDefinedPrimaryKey.Array() if chunkColumn != "" { pkColumns = []string{chunkColumn} @@ -124,8 +123,9 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo var ( ok bool step int64 + minVal any + maxVal any ) - var minVal, maxVal any if stream.GetStream().SourceDefinedPrimaryKey.Len() > 0 || chunkColumn != "" { err = jdbc.WithIsolation(ctx, m.client, true, func(tx *sql.Tx) error { @@ -143,7 +143,6 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo ok, step = shouldUseEvenDistribution(minVal, maxVal, approxRowCount, chunkSize) } - logger.Infof("fghjdbzjsbi %s", stream.ID()) // EVEN distribution if len(pkColumns) == 1 && ok { logger.Infof("Splitting evenly for stream %s using step %d (min: %v, max: %v)", stream.ID(), step, minVal, maxVal) From c09aee84569f6675f78d4c2a271c63202c95cbf8 Mon Sep 17 00:00:00 2001 From: saksham-datazip Date: Tue, 3 Feb 2026 12:00:36 +0530 Subject: [PATCH 03/23] chore: formatting fix --- drivers/mysql/internal/backfill.go | 305 +++++++++++------------------ 1 file changed, 119 insertions(+), 186 deletions(-) diff --git a/drivers/mysql/internal/backfill.go b/drivers/mysql/internal/backfill.go index cbfadf147..a3dd968fa 100644 --- a/drivers/mysql/internal/backfill.go +++ b/drivers/mysql/internal/backfill.go @@ -43,37 +43,21 @@ func (m *MySQL) ChunkIterator(ctx context.Context, stream types.StreamInterface, logger.Debugf("Starting backfill from %v to %v with filter: %s, args: %v", chunk.Min, chunk.Max, filter, args) // Get cxhunks from state or calculate new ones var stmt string - - // FULL TABLE SCAN (prevents `WHERE ` SQL syntax error) if chunk.Min == nil && chunk.Max == nil && filter == "" { - stmt = fmt.Sprintf( - "SELECT * FROM `%s`.`%s`", - stream.Namespace(), - stream.Name(), - ) + stmt = fmt.Sprintf("SELECT * FROM `%s`.`%s`", stream.Namespace(), stream.Name()) + } else if chunkColumn != "" { + stmt = jdbc.MysqlChunkScanQuery(stream, []string{chunkColumn}, chunk, filter) + } else if len(pkColumns) > 0 { + stmt = jdbc.MysqlChunkScanQuery(stream, pkColumns, chunk, filter) } else { - if chunkColumn != "" { - stmt = jdbc.MysqlChunkScanQuery(stream, []string{chunkColumn}, chunk, filter) - } else if len(pkColumns) > 0 { - stmt = jdbc.MysqlChunkScanQuery(stream, pkColumns, chunk, filter) - } else { - stmt = jdbc.MysqlLimitOffsetScanQuery(stream, chunk, filter) - } + stmt = jdbc.MysqlLimitOffsetScanQuery(stream, chunk, filter) } logger.Debugf("Executing chunk query: %s", stmt) setter := jdbc.NewReader(ctx, stmt, func(ctx context.Context, query string, queryArgs ...any) (*sql.Rows, error) { return tx.QueryContext(ctx, query, args...) }) - // Capture and process rows - return setter.Capture(func(rows *sql.Rows) error { - record := make(types.Record) - err := jdbc.MapScan(rows, record, m.dataTypeConverter) - if err != nil { - return fmt.Errorf("failed to scan record data as map: %s", err) - } - return OnMessage(ctx, record) - }) + return jdbc.MapScanConcurrent(setter, m.dataTypeConverter, OnMessage) }) } @@ -113,23 +97,23 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo chunks := types.NewSet[types.Chunk]() chunkColumn := stream.Self().StreamMetadata.ChunkColumn + var ( + ok bool + step int64 + minVal any + maxVal any + minf float64 + maxf float64 + ) + pkColumns := stream.GetStream().SourceDefinedPrimaryKey.Array() if chunkColumn != "" { pkColumns = []string{chunkColumn} } sort.Strings(pkColumns) - // only meaningful for single-column PK - var ( - ok bool - step int64 - minVal any - maxVal any - ) - if stream.GetStream().SourceDefinedPrimaryKey.Len() > 0 || chunkColumn != "" { err = jdbc.WithIsolation(ctx, m.client, true, func(tx *sql.Tx) error { - var err error minVal, maxVal, err = m.getTableExtremes(ctx, stream, pkColumns, tx) return err @@ -138,186 +122,135 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo return nil, fmt.Errorf("failed to get table extremes: %s", err) } } - if len(pkColumns) == 1 { - ok, step = shouldUseEvenDistribution(minVal, maxVal, approxRowCount, chunkSize) + ok, step, minf, maxf = shouldUseEvenDistribution(minVal, maxVal, approxRowCount, chunkSize) } - - // EVEN distribution - if len(pkColumns) == 1 && ok { - logger.Infof("Splitting evenly for stream %s using step %d (min: %v, max: %v)", stream.ID(), step, minVal, maxVal) - splitEvenlyForInt(minVal, maxVal, chunks, step) - if err != nil { - return nil, fmt.Errorf("failed to split evenly: %s", err) - } - - } else if len(pkColumns) > 0 { - logger.Infof("Splitting via PK for stream %s", stream.ID()) - err = splitViaPrimaryKey( - ctx, - m, - stream, - chunks, - chunkColumn, - chunkSize, - minVal, - maxVal, - pkColumns, - ) - - //LIMIT / OFFSET fallback (no PK / no chunk column) - } else { - logger.Infof("Using Limit/Offset chunking for stream %s", stream.ID()) - err = limitOffsetChunking( - ctx, - m, - chunks, - chunkSize, - approxRowCount, - ) - } - logger.Infof("Generated %d chunks for stream %s", chunks.Len(), stream.ID()) - return chunks, err -} - -func (m *MySQL) getTableExtremes(ctx context.Context, stream types.StreamInterface, pkColumns []string, tx *sql.Tx) (min, max any, err error) { - query := jdbc.MinMaxQueryMySQL(stream, pkColumns) - err = tx.QueryRowContext(ctx, query).Scan(&min, &max) - return min, max, err -} - -func limitOffsetChunking(ctx context.Context, m *MySQL, chunks *types.Set[types.Chunk], chunkSize int64, approxRowCount int64) error { - return jdbc.WithIsolation(ctx, m.client, true, func(tx *sql.Tx) error { - chunks.Insert(types.Chunk{ - Min: nil, - Max: utils.ConvertToString(chunkSize), - }) - lastChunk := chunkSize - for lastChunk < approxRowCount { + // Takes the user defined batch size as chunkSize + // TODO: common-out the chunking logic for db2, mssql, mysql + splitViaPrimaryKey := func(stream types.StreamInterface, chunks *types.Set[types.Chunk]) error { + return jdbc.WithIsolation(ctx, m.client, true, func(tx *sql.Tx) error { + if minVal == nil { + return nil + } chunks.Insert(types.Chunk{ - Min: utils.ConvertToString(lastChunk), - Max: utils.ConvertToString(lastChunk + chunkSize), + Min: nil, + Max: utils.ConvertToString(minVal), }) - lastChunk += chunkSize - } - chunks.Insert(types.Chunk{ - Min: utils.ConvertToString(lastChunk), - Max: nil, - }) - return nil - }) -} -func splitViaPrimaryKey(ctx context.Context, m *MySQL, stream types.StreamInterface, chunks *types.Set[types.Chunk], chunkColumn string, chunkSize int64, minVal any, maxVal any, pkColumns []string) error { - return jdbc.WithIsolation(ctx, m.client, true, func(tx *sql.Tx) error { - chunks.Insert(types.Chunk{ - Min: nil, - Max: utils.ConvertToString(minVal), - }) - - logger.Infof("Stream %s extremes - min: %v, max: %v", stream.ID(), utils.ConvertToString(minVal), utils.ConvertToString(maxVal)) - - // Generate chunks based on range - query := jdbc.NextChunkEndQuery(stream, pkColumns, chunkSize) - currentVal := minVal - for { - // Split the current value into parts - columns := strings.Split(utils.ConvertToString(currentVal), ",") - - // Create args array with the correct number of arguments for the query - args := make([]interface{}, 0) - for columnIndex := 0; columnIndex < len(pkColumns); columnIndex++ { - // For each column combination in the WHERE clause, we need to add the necessary parts - for partIndex := 0; partIndex <= columnIndex && partIndex < len(columns); partIndex++ { - args = append(args, columns[partIndex]) + logger.Infof("Stream %s extremes - min: %v, max: %v", stream.ID(), utils.ConvertToString(minVal), utils.ConvertToString(maxVal)) + + // Generate chunks based on range + query := jdbc.NextChunkEndQuery(stream, pkColumns, chunkSize) + currentVal := minVal + for { + // Split the current value into parts + columns := strings.Split(utils.ConvertToString(currentVal), ",") + + // Create args array with the correct number of arguments for the query + args := make([]interface{}, 0) + for columnIndex := 0; columnIndex < len(pkColumns); columnIndex++ { + // For each column combination in the WHERE clause, we need to add the necessary parts + for partIndex := 0; partIndex <= columnIndex && partIndex < len(columns); partIndex++ { + args = append(args, columns[partIndex]) + } } + var nextValRaw interface{} + err := tx.QueryRowContext(ctx, query, args...).Scan(&nextValRaw) + if err == sql.ErrNoRows || nextValRaw == nil { + break + } else if err != nil { + return fmt.Errorf("failed to get next chunk end: %s", err) + } + if currentVal != nil && nextValRaw != nil { + chunks.Insert(types.Chunk{ + Min: utils.ConvertToString(currentVal), + Max: utils.ConvertToString(nextValRaw), + }) + } + currentVal = nextValRaw } - var nextValRaw interface{} - err := tx.QueryRowContext(ctx, query, args...).Scan(&nextValRaw) - if err == sql.ErrNoRows || nextValRaw == nil { - break - } else if err != nil { - return fmt.Errorf("failed to get next chunk end: %s", err) - } - if currentVal != nil && nextValRaw != nil { + if currentVal != nil { chunks.Insert(types.Chunk{ Min: utils.ConvertToString(currentVal), - Max: utils.ConvertToString(nextValRaw), + Max: nil, }) } - currentVal = nextValRaw - } - if currentVal != nil { + + return nil + }) + } + limitOffsetChunking := func(chunks *types.Set[types.Chunk]) error { + return jdbc.WithIsolation(ctx, m.client, true, func(tx *sql.Tx) error { + chunks.Insert(types.Chunk{ + Min: nil, + Max: utils.ConvertToString(chunkSize), + }) + lastChunk := chunkSize + for lastChunk < approxRowCount { + chunks.Insert(types.Chunk{ + Min: utils.ConvertToString(lastChunk), + Max: utils.ConvertToString(lastChunk + chunkSize), + }) + lastChunk += chunkSize + } chunks.Insert(types.Chunk{ - Min: utils.ConvertToString(currentVal), + Min: utils.ConvertToString(lastChunk), Max: nil, }) + return nil + }) + } + splitEvenlyForInt := func(minf, maxf float64, chunks *types.Set[types.Chunk], step float64) { + if minf+step > maxf { + chunks.Insert(types.Chunk{ + Min: nil, + Max: nil, + }) + return } - - return nil - }) + prev := minf + for next := minf + step; next <= maxf; next += step { + chunks.Insert(types.Chunk{ + Min: utils.ConvertToString(prev), + Max: utils.ConvertToString(next), + }) + prev = next + } + chunks.Insert(types.Chunk{ + Min: utils.ConvertToString(prev), + Max: nil, + }) + } + if len(pkColumns) == 1 && ok { + splitEvenlyForInt(minf, maxf, chunks, float64(step)) + } else if len(pkColumns) > 0 { + err = splitViaPrimaryKey(stream, chunks) + } else { + err = limitOffsetChunking(chunks) + } + return chunks, err } -func shouldUseEvenDistribution( - minVal any, - maxVal any, - approxRowCount int64, - chunkSize int64, -) (bool, int64) { +func (m *MySQL) getTableExtremes(ctx context.Context, stream types.StreamInterface, pkColumns []string, tx *sql.Tx) (min, max any, err error) { + query := jdbc.MinMaxQueryMySQL(stream, pkColumns) + err = tx.QueryRowContext(ctx, query).Scan(&min, &max) + return min, max, err +} +func shouldUseEvenDistribution(minVal any, maxVal any, approxRowCount int64, chunkSize int64) (bool, int64, float64, float64) { if approxRowCount == 0 { - return false, 0 + return false, 0, 0, 0 } - minF, err1 := typeutils.ReformatFloat64(minVal) maxF, err2 := typeutils.ReformatFloat64(maxVal) if err1 != nil || err2 != nil { - return false, 0 + return false, 0, 0, 0 } - if maxF < minF { - return false, 0 - } - // (max - min + 1) / rowCount(int64) distributionFactor := (maxF - minF + 1) / float64(approxRowCount) - - // margin check if distributionFactor < constants.DistributionLower || distributionFactor > constants.DistributionUpper { - return false, 0 + return false, 0, 0, 0 } - - // dynamic chunk size or pk_steps - dynamicChunkSize := int64(math.Max(distributionFactor*float64(chunkSize), 1)) - - return true, dynamicChunkSize -} - -func splitEvenlyForInt(minVal, maxVal any, chunks *types.Set[types.Chunk], step int64) { - start, _ := typeutils.ReformatFloat64(minVal) - end, _ := typeutils.ReformatFloat64(maxVal) - logger.Infof("splitEvenlyForInt start=%v end=%v step=%d", start, end, step) - if start+float64(step) > end { - chunks.Insert(types.Chunk{ - Min: nil, - Max: nil, - }) - logger.Infof("Generated single chunk: %+v", chunks) - return - } - - prev := start - - for next := start + float64(step); next <= end; next += float64(step) { - chunks.Insert(types.Chunk{ - Min: utils.ConvertToString(prev), - Max: utils.ConvertToString(next), - }) - prev = next - } - - chunks.Insert(types.Chunk{ - Min: utils.ConvertToString(prev), - Max: nil, - }) - logger.Infof("Generated chunk: %+v", chunks) + step := int64(math.Max(distributionFactor*float64(chunkSize), 1)) + return true, step, minF, maxF } From 53520dec34a03de1501a04ee3901b39ade57f937 Mon Sep 17 00:00:00 2001 From: saksham-datazip Date: Tue, 3 Feb 2026 15:16:00 +0530 Subject: [PATCH 04/23] my-sql-chunking-formatting-resolved --- drivers/mysql/internal/backfill.go | 35 +++++++++++++++--------------- pkg/jdbc/jdbc.go | 3 +++ 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/drivers/mysql/internal/backfill.go b/drivers/mysql/internal/backfill.go index a3dd968fa..64f61389b 100644 --- a/drivers/mysql/internal/backfill.go +++ b/drivers/mysql/internal/backfill.go @@ -43,9 +43,7 @@ func (m *MySQL) ChunkIterator(ctx context.Context, stream types.StreamInterface, logger.Debugf("Starting backfill from %v to %v with filter: %s, args: %v", chunk.Min, chunk.Max, filter, args) // Get cxhunks from state or calculate new ones var stmt string - if chunk.Min == nil && chunk.Max == nil && filter == "" { - stmt = fmt.Sprintf("SELECT * FROM `%s`.`%s`", stream.Namespace(), stream.Name()) - } else if chunkColumn != "" { + if chunkColumn != "" { stmt = jdbc.MysqlChunkScanQuery(stream, []string{chunkColumn}, chunk, filter) } else if len(pkColumns) > 0 { stmt = jdbc.MysqlChunkScanQuery(stream, pkColumns, chunk, filter) @@ -98,12 +96,12 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo chunkColumn := stream.Self().StreamMetadata.ChunkColumn var ( - ok bool - step int64 - minVal any - maxVal any - minf float64 - maxf float64 + isEvenDistribution bool + step int64 + minVal any //to define lower range of the chunk + maxVal any //to define upper range of the chunk + minFloat float64 + maxFloat float64 ) pkColumns := stream.GetStream().SourceDefinedPrimaryKey.Array() @@ -123,7 +121,7 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo } } if len(pkColumns) == 1 { - ok, step, minf, maxf = shouldUseEvenDistribution(minVal, maxVal, approxRowCount, chunkSize) + isEvenDistribution, step, minFloat, maxFloat = shouldUseEvenDistribution(minVal, maxVal, approxRowCount, chunkSize) } // Takes the user defined batch size as chunkSize // TODO: common-out the chunking logic for db2, mssql, mysql @@ -200,6 +198,8 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo return nil }) } + + //used mathematical calculation to split the chunks for cases where the distribution factor is within the range splitEvenlyForInt := func(minf, maxf float64, chunks *types.Set[types.Chunk], step float64) { if minf+step > maxf { chunks.Insert(types.Chunk{ @@ -221,8 +221,8 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo Max: nil, }) } - if len(pkColumns) == 1 && ok { - splitEvenlyForInt(minf, maxf, chunks, float64(step)) + if len(pkColumns) == 1 && isEvenDistribution { + splitEvenlyForInt(minFloat, maxFloat, chunks, float64(step)) } else if len(pkColumns) > 0 { err = splitViaPrimaryKey(stream, chunks) } else { @@ -241,16 +241,15 @@ func shouldUseEvenDistribution(minVal any, maxVal any, approxRowCount int64, chu if approxRowCount == 0 { return false, 0, 0, 0 } - minF, err1 := typeutils.ReformatFloat64(minVal) - maxF, err2 := typeutils.ReformatFloat64(maxVal) + minFloat, err1 := typeutils.ReformatFloat64(minVal) + maxFloat, err2 := typeutils.ReformatFloat64(maxVal) if err1 != nil || err2 != nil { return false, 0, 0, 0 } - distributionFactor := (maxF - minF + 1) / float64(approxRowCount) - if distributionFactor < constants.DistributionLower || - distributionFactor > constants.DistributionUpper { + distributionFactor := (maxFloat - minFloat + 1) / float64(approxRowCount) + if distributionFactor < constants.DistributionLower || distributionFactor > constants.DistributionUpper { return false, 0, 0, 0 } step := int64(math.Max(distributionFactor*float64(chunkSize), 1)) - return true, step, minF, maxF + return true, step, minFloat, maxFloat } diff --git a/pkg/jdbc/jdbc.go b/pkg/jdbc/jdbc.go index 214a4251e..f72d339c1 100644 --- a/pkg/jdbc/jdbc.go +++ b/pkg/jdbc/jdbc.go @@ -429,6 +429,9 @@ func MysqlLimitOffsetScanQuery(stream types.StreamInterface, chunk types.Chunk, func MysqlChunkScanQuery(stream types.StreamInterface, filterColumns []string, chunk types.Chunk, extraFilter string) string { condition := buildChunkConditionMySQL(filterColumns, chunk, extraFilter) quotedTable := QuoteTable(stream.Namespace(), stream.Name(), constants.MySQL) + if condition == "" { + condition = utils.Ternary(extraFilter != "", extraFilter, "1 = 1").(string) + } return fmt.Sprintf("SELECT * FROM %s WHERE %s", quotedTable, condition) } From 3b9fbe7aa0df8f0148c3080fb7d9535305d2668c Mon Sep 17 00:00:00 2001 From: saksham-datazip Date: Tue, 3 Feb 2026 15:22:54 +0530 Subject: [PATCH 05/23] mysql-chunking-self-reviewed --- drivers/mysql/internal/backfill.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/mysql/internal/backfill.go b/drivers/mysql/internal/backfill.go index 64f61389b..6b60a4348 100644 --- a/drivers/mysql/internal/backfill.go +++ b/drivers/mysql/internal/backfill.go @@ -41,7 +41,7 @@ func (m *MySQL) ChunkIterator(ctx context.Context, stream types.StreamInterface, sort.Strings(pkColumns) logger.Debugf("Starting backfill from %v to %v with filter: %s, args: %v", chunk.Min, chunk.Max, filter, args) - // Get cxhunks from state or calculate new ones + // Get chunks from state or calculate new ones var stmt string if chunkColumn != "" { stmt = jdbc.MysqlChunkScanQuery(stream, []string{chunkColumn}, chunk, filter) @@ -50,7 +50,6 @@ func (m *MySQL) ChunkIterator(ctx context.Context, stream types.StreamInterface, } else { stmt = jdbc.MysqlLimitOffsetScanQuery(stream, chunk, filter) } - logger.Debugf("Executing chunk query: %s", stmt) setter := jdbc.NewReader(ctx, stmt, func(ctx context.Context, query string, queryArgs ...any) (*sql.Rows, error) { return tx.QueryContext(ctx, query, args...) From 8e4ba6aefeb10ddbaac4bf3e0acdc7e596b8a1c6 Mon Sep 17 00:00:00 2001 From: saksham-datazip Date: Sat, 7 Feb 2026 20:48:40 +0530 Subject: [PATCH 06/23] mysql-chunking-optimization-for-string-pk --- constants/constants.go | 1 + drivers/mysql/internal/backfill.go | 93 ++++++++++++++++++++++++++---- pkg/jdbc/jdbc.go | 13 +++++ 3 files changed, 96 insertions(+), 11 deletions(-) diff --git a/constants/constants.go b/constants/constants.go index 925291aab..a760af5a4 100644 --- a/constants/constants.go +++ b/constants/constants.go @@ -38,6 +38,7 @@ const ( DistributionLower = 0.05 // DistributionUpper is the upper bound for distribution factor DistributionUpper = 100.0 + UnicodeSize = 1114112 ) type DriverType string diff --git a/drivers/mysql/internal/backfill.go b/drivers/mysql/internal/backfill.go index 6b60a4348..82aece8fa 100644 --- a/drivers/mysql/internal/backfill.go +++ b/drivers/mysql/internal/backfill.go @@ -5,6 +5,7 @@ import ( "database/sql" "fmt" "math" + "math/big" "sort" "strings" @@ -94,11 +95,19 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo chunks := types.NewSet[types.Chunk]() chunkColumn := stream.Self().StreamMetadata.ChunkColumn + var avgSchemaSize int64 + avgSchemaSizeQuery := jdbc.MySQLTableSizeQuery() + err = m.client.QueryRowContext(ctx, avgSchemaSizeQuery, stream.Name()).Scan(&avgSchemaSize) + if err != nil { + return nil, fmt.Errorf("failed to get avg schema size: %s", err) + } + expectedChunks := (avgSchemaSize + chunkSize - 1) / chunkSize + var ( isEvenDistribution bool step int64 - minVal any //to define lower range of the chunk - maxVal any //to define upper range of the chunk + minVal any //to define lower range of the chunk + maxVal any //to define upper range of the chunk minFloat float64 maxFloat float64 ) @@ -198,15 +207,12 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo }) } - //used mathematical calculation to split the chunks for cases where the distribution factor is within the range + //used mathematical calculation to split the chunks for cases where the distribution factor is within the range when pk is numeric splitEvenlyForInt := func(minf, maxf float64, chunks *types.Set[types.Chunk], step float64) { - if minf+step > maxf { - chunks.Insert(types.Chunk{ - Min: nil, - Max: nil, - }) - return - } + chunks.Insert(types.Chunk{ + Min: nil, + Max: utils.ConvertToString(minf), + }) prev := minf for next := minf + step; next <= maxf; next += step { chunks.Insert(types.Chunk{ @@ -220,9 +226,49 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo Max: nil, }) } + + //used mathematical calculation to split the chunks for cases where the pk columns size is 1 and pk data type is string + splitEvenlyForString := func(minVal, maxVal any, expectedChunks int64) { + maxValBaseN, err1 := convertStringToIntBaseN(utils.ConvertToString(maxVal)) + minValBaseN, err2 := convertStringToIntBaseN(utils.ConvertToString(minVal)) + if err1 != nil || err2 != nil { + return + } + if expectedChunks <= 0 { + expectedChunks = 1 + } + maxCopy := new(big.Int).Set(&maxValBaseN) + chunkdiff := maxValBaseN.Sub(maxCopy, &minValBaseN) + chunkdiff.Div(chunkdiff, big.NewInt(expectedChunks)) + if chunkdiff.Cmp(big.NewInt(0)) == 0 { + chunks.Insert(types.Chunk{ + Min: nil, + Max: nil, + }) + return + } + prev := &minValBaseN + chunks.Insert(types.Chunk{ + Min: nil, + Max: *convertIntBaseNtoString(prev), + }) + for next := new(big.Int).Add(prev, chunkdiff); next.Cmp(&maxValBaseN) < 0; next.Add(next, chunkdiff) { + chunks.Insert(types.Chunk{ + Min: *convertIntBaseNtoString(prev), + Max: *convertIntBaseNtoString(next), + }) + prev = new(big.Int).Set(next) + } + chunks.Insert(types.Chunk{ + Min: *convertIntBaseNtoString(prev), + Max: nil, + }) + } if len(pkColumns) == 1 && isEvenDistribution { splitEvenlyForInt(minFloat, maxFloat, chunks, float64(step)) - } else if len(pkColumns) > 0 { + } else if len(pkColumns) == 1 { + splitEvenlyForString(minVal, maxVal, expectedChunks) + } else if len(pkColumns) > 1 { err = splitViaPrimaryKey(stream, chunks) } else { err = limitOffsetChunking(chunks) @@ -252,3 +298,28 @@ func shouldUseEvenDistribution(minVal any, maxVal any, approxRowCount int64, chu step := int64(math.Max(distributionFactor*float64(chunkSize), 1)) return true, step, minFloat, maxFloat } + +// convert a string to a baseN number +func convertStringToIntBaseN(s string) (big.Int, error) { + base := big.NewInt(constants.UnicodeSize) + val := big.NewInt(0) + + for _, ch := range []rune(s) { + val.Mul(val, base) + val.Add(val, big.NewInt(int64(ch))) + } + return *val, nil +} + +// convert a baseN number to a string pointer +func convertIntBaseNtoString(n *big.Int) *string { + ans := "" + base := big.NewInt(constants.UnicodeSize) + x := new(big.Int).Set(n) + for x.Cmp(big.NewInt(0)) > 0 { + rem := new(big.Int).Mod(x, base) + ans = string(rune(rem.Int64())) + ans + x.Div(x, base) + } + return &ans +} diff --git a/pkg/jdbc/jdbc.go b/pkg/jdbc/jdbc.go index f72d339c1..dd0f47e5b 100644 --- a/pkg/jdbc/jdbc.go +++ b/pkg/jdbc/jdbc.go @@ -512,6 +512,19 @@ func MySQLTableRowStatsQuery() string { ` } +//MySQLTABLESizeQuery returns the query to fetch the size of a table in MySQL +func MySQLTableSizeQuery() string { + return ` + SELECT + DATA_LENGTH + INDEX_LENGTH AS table_size + FROM + INFORMATION_SCHEMA.TABLES + WHERE + TABLE_SCHEMA = DATABASE() + AND TABLE_NAME = ? + ` +} + // MySQLTableExistsQuery returns the query to check if a table has any rows using EXISTS func MySQLTableExistsQuery(stream types.StreamInterface) string { quotedTable := QuoteTable(stream.Namespace(), stream.Name(), constants.MySQL) From ccfb371aaf3efaf8d7129348d9e770e611a0ed20 Mon Sep 17 00:00:00 2001 From: saksham-datazip Date: Mon, 9 Feb 2026 13:39:59 +0530 Subject: [PATCH 07/23] feat: solved lint issue --- drivers/mssql/internal/incremental.go | 2 +- drivers/mssql/main.go | 1 - drivers/mysql/internal/config.go | 2 +- pkg/jdbc/jdbc.go | 2 +- 4 files changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/mssql/internal/incremental.go b/drivers/mssql/internal/incremental.go index 237a8a59c..dca23183f 100644 --- a/drivers/mssql/internal/incremental.go +++ b/drivers/mssql/internal/incremental.go @@ -50,4 +50,4 @@ func (m *MSSQL) FetchMaxCursorValues(ctx context.Context, stream types.StreamInt return nil, nil, err } return maxPrimaryCursorValue, maxSecondaryCursorValue, nil -} \ No newline at end of file +} diff --git a/drivers/mssql/main.go b/drivers/mssql/main.go index 8d80455ae..bc8c26f5c 100644 --- a/drivers/mssql/main.go +++ b/drivers/mssql/main.go @@ -10,4 +10,3 @@ func main() { defer driver.Close() olake.RegisterDriver(driver) } - diff --git a/drivers/mysql/internal/config.go b/drivers/mysql/internal/config.go index 55995eda8..b1fd93250 100644 --- a/drivers/mysql/internal/config.go +++ b/drivers/mysql/internal/config.go @@ -63,7 +63,7 @@ func (c *Config) URI() (string, error) { if err != nil { return "", fmt.Errorf("failed to build TLS config: %s", err) } - + tlsConfigName := "mysql_" + utils.ULID() if err := mysql.RegisterTLSConfig(tlsConfigName, tlsConfig); err != nil { return "", fmt.Errorf("failed to register TLS config: %s", err) diff --git a/pkg/jdbc/jdbc.go b/pkg/jdbc/jdbc.go index ce573872b..ee9b399f8 100644 --- a/pkg/jdbc/jdbc.go +++ b/pkg/jdbc/jdbc.go @@ -512,7 +512,7 @@ func MySQLTableRowStatsQuery() string { ` } -//MySQLTABLESizeQuery returns the query to fetch the size of a table in MySQL +// MySQLTABLESizeQuery returns the query to fetch the size of a table in MySQL func MySQLTableSizeQuery() string { return ` SELECT From 910246ad3a3111744f9c0284d04c78ea95a62a55 Mon Sep 17 00:00:00 2001 From: saksham-datazip Date: Tue, 10 Feb 2026 15:20:50 +0530 Subject: [PATCH 08/23] feat: mysql chunking optimization review resolved --- drivers/mssql/main.go | 1 + drivers/mysql/internal/backfill.go | 149 +++++++++++++++++------------ pkg/jdbc/jdbc.go | 19 +--- 3 files changed, 89 insertions(+), 80 deletions(-) diff --git a/drivers/mssql/main.go b/drivers/mssql/main.go index bc8c26f5c..8d80455ae 100644 --- a/drivers/mssql/main.go +++ b/drivers/mssql/main.go @@ -10,3 +10,4 @@ func main() { defer driver.Close() olake.RegisterDriver(driver) } + diff --git a/drivers/mysql/internal/backfill.go b/drivers/mysql/internal/backfill.go index 82aece8fa..a956150a9 100644 --- a/drivers/mysql/internal/backfill.go +++ b/drivers/mysql/internal/backfill.go @@ -6,6 +6,7 @@ import ( "fmt" "math" "math/big" + "slices" "sort" "strings" @@ -62,8 +63,9 @@ func (m *MySQL) ChunkIterator(ctx context.Context, stream types.StreamInterface, func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPool, stream types.StreamInterface) (*types.Set[types.Chunk], error) { var approxRowCount int64 var avgRowSize any + var avgSchemaSize int64 approxRowCountQuery := jdbc.MySQLTableRowStatsQuery() - err := m.client.QueryRowContext(ctx, approxRowCountQuery, stream.Name()).Scan(&approxRowCount, &avgRowSize) + err := m.client.QueryRowContext(ctx, approxRowCountQuery, stream.Name()).Scan(&approxRowCount, &avgRowSize, &avgSchemaSize) if err != nil { return nil, fmt.Errorf("failed to get approx row count and avg row size: %s", err) } @@ -95,21 +97,13 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo chunks := types.NewSet[types.Chunk]() chunkColumn := stream.Self().StreamMetadata.ChunkColumn - var avgSchemaSize int64 - avgSchemaSizeQuery := jdbc.MySQLTableSizeQuery() - err = m.client.QueryRowContext(ctx, avgSchemaSizeQuery, stream.Name()).Scan(&avgSchemaSize) - if err != nil { - return nil, fmt.Errorf("failed to get avg schema size: %s", err) - } - expectedChunks := (avgSchemaSize + chunkSize - 1) / chunkSize - var ( - isEvenDistribution bool - step int64 - minVal any //to define lower range of the chunk - maxVal any //to define upper range of the chunk - minFloat float64 - maxFloat float64 + isNumericAndEvenDistributed bool + step int64 + minVal any //to define lower range of the chunk + maxVal any //to define upper range of the chunk + minFloat float64 + maxFloat float64 ) pkColumns := stream.GetStream().SourceDefinedPrimaryKey.Array() @@ -118,18 +112,17 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo } sort.Strings(pkColumns) - if stream.GetStream().SourceDefinedPrimaryKey.Len() > 0 || chunkColumn != "" { - err = jdbc.WithIsolation(ctx, m.client, true, func(tx *sql.Tx) error { - var err error - minVal, maxVal, err = m.getTableExtremes(ctx, stream, pkColumns, tx) - return err - }) + if len(pkColumns) > 0 || chunkColumn != "" { + minVal, maxVal, err = m.getTableExtremes(ctx, stream, pkColumns) if err != nil { - return nil, fmt.Errorf("failed to get table extremes: %s", err) + logger.Debugf("Stream %s: Failed to get table extremes: %v", stream.ID(), err) } } if len(pkColumns) == 1 { - isEvenDistribution, step, minFloat, maxFloat = shouldUseEvenDistribution(minVal, maxVal, approxRowCount, chunkSize) + isNumericAndEvenDistributed, step, minFloat, maxFloat, err = IsNumericAndEvenDistributed(minVal, maxVal, approxRowCount, chunkSize) + if err != nil { + logger.Debugf("Stream %s: PK is not numeric or conversion failed, falling back to string splitting: %v", stream.ID(), err) + } } // Takes the user defined batch size as chunkSize // TODO: common-out the chunking logic for db2, mssql, mysql @@ -208,13 +201,13 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo } //used mathematical calculation to split the chunks for cases where the distribution factor is within the range when pk is numeric - splitEvenlyForInt := func(minf, maxf float64, chunks *types.Set[types.Chunk], step float64) { + splitEvenlyForInt := func(chunks *types.Set[types.Chunk], step float64) { chunks.Insert(types.Chunk{ Min: nil, - Max: utils.ConvertToString(minf), + Max: utils.ConvertToString(minFloat), }) - prev := minf - for next := minf + step; next <= maxf; next += step { + prev := minFloat + for next := minFloat + step; next <= maxFloat; next += step { chunks.Insert(types.Chunk{ Min: utils.ConvertToString(prev), Max: utils.ConvertToString(next), @@ -228,79 +221,105 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo } //used mathematical calculation to split the chunks for cases where the pk columns size is 1 and pk data type is string - splitEvenlyForString := func(minVal, maxVal any, expectedChunks int64) { - maxValBaseN, err1 := convertStringToIntBaseN(utils.ConvertToString(maxVal)) - minValBaseN, err2 := convertStringToIntBaseN(utils.ConvertToString(minVal)) - if err1 != nil || err2 != nil { - return + splitEvenlyForString := func(chunks *types.Set[types.Chunk]) error { + var maxValBaseN, minValBaseN big.Int + + err := utils.ConcurrentF( + ctx, + func(ctx context.Context) error { + val, err := convertUnicodeStringToInt(utils.ConvertToString(maxVal)) + maxValBaseN.Set(&val) + return err + }, + func(ctx context.Context) error { + val, err := convertUnicodeStringToInt(utils.ConvertToString(minVal)) + minValBaseN.Set(&val) + return err + }, + ) + if err != nil { + return fmt.Errorf("failed to convert string to int: %v", err) } + + expectedChunks := (avgSchemaSize + chunkSize - 1) / chunkSize if expectedChunks <= 0 { expectedChunks = 1 } - maxCopy := new(big.Int).Set(&maxValBaseN) - chunkdiff := maxValBaseN.Sub(maxCopy, &minValBaseN) + chunkdiff := new(big.Int).Sub(&maxValBaseN, &minValBaseN) chunkdiff.Div(chunkdiff, big.NewInt(expectedChunks)) - if chunkdiff.Cmp(big.NewInt(0)) == 0 { - chunks.Insert(types.Chunk{ - Min: nil, - Max: nil, - }) - return - } prev := &minValBaseN chunks.Insert(types.Chunk{ Min: nil, - Max: *convertIntBaseNtoString(prev), + Max: convertIntUnicodeToString(prev), }) for next := new(big.Int).Add(prev, chunkdiff); next.Cmp(&maxValBaseN) < 0; next.Add(next, chunkdiff) { + var minStr, maxStr string + _ = utils.ConcurrentF(ctx, + func(ctx context.Context) error { + minStr = convertIntUnicodeToString(prev) + return nil + }, + func(ctx context.Context) error { + maxStr = convertIntUnicodeToString(next) + return nil + }, + ) chunks.Insert(types.Chunk{ - Min: *convertIntBaseNtoString(prev), - Max: *convertIntBaseNtoString(next), + Min: minStr, + Max: maxStr, }) prev = new(big.Int).Set(next) } chunks.Insert(types.Chunk{ - Min: *convertIntBaseNtoString(prev), + Min: convertIntUnicodeToString(prev), Max: nil, }) + return nil } - if len(pkColumns) == 1 && isEvenDistribution { - splitEvenlyForInt(minFloat, maxFloat, chunks, float64(step)) - } else if len(pkColumns) == 1 { - splitEvenlyForString(minVal, maxVal, expectedChunks) - } else if len(pkColumns) > 1 { + + switch { + case len(pkColumns) == 1 && isNumericAndEvenDistributed: + splitEvenlyForInt(chunks, float64(step)) + case len(pkColumns) == 1: + err = splitEvenlyForString(chunks) + case len(pkColumns) > 1: err = splitViaPrimaryKey(stream, chunks) - } else { + default: err = limitOffsetChunking(chunks) } return chunks, err } -func (m *MySQL) getTableExtremes(ctx context.Context, stream types.StreamInterface, pkColumns []string, tx *sql.Tx) (min, max any, err error) { +func (m *MySQL) getTableExtremes(ctx context.Context, stream types.StreamInterface, pkColumns []string) (min, max any, err error) { query := jdbc.MinMaxQueryMySQL(stream, pkColumns) - err = tx.QueryRowContext(ctx, query).Scan(&min, &max) + err = m.client.QueryRowContext(ctx, query).Scan(&min, &max) return min, max, err } -func shouldUseEvenDistribution(minVal any, maxVal any, approxRowCount int64, chunkSize int64) (bool, int64, float64, float64) { +// checks if the pk column is numeric and evenly distributed +func IsNumericAndEvenDistributed(minVal any, maxVal any, approxRowCount int64, chunkSize int64) (bool, int64, float64, float64, error) { if approxRowCount == 0 { - return false, 0, 0, 0 + return false, 0, 0, 0, nil } minFloat, err1 := typeutils.ReformatFloat64(minVal) maxFloat, err2 := typeutils.ReformatFloat64(maxVal) if err1 != nil || err2 != nil { - return false, 0, 0, 0 + if err1 != nil { + return false, 0, 0, 0, err1 + } + return false, 0, 0, 0, err2 } distributionFactor := (maxFloat - minFloat + 1) / float64(approxRowCount) if distributionFactor < constants.DistributionLower || distributionFactor > constants.DistributionUpper { - return false, 0, 0, 0 + err := fmt.Errorf("distribution factor is not in the range of %f to %f", constants.DistributionLower, constants.DistributionUpper) + return false, 0, 0, 0, err } step := int64(math.Max(distributionFactor*float64(chunkSize), 1)) - return true, step, minFloat, maxFloat + return true, step, minFloat, maxFloat, nil } // convert a string to a baseN number -func convertStringToIntBaseN(s string) (big.Int, error) { +func convertUnicodeStringToInt(s string) (big.Int, error) { base := big.NewInt(constants.UnicodeSize) val := big.NewInt(0) @@ -312,14 +331,18 @@ func convertStringToIntBaseN(s string) (big.Int, error) { } // convert a baseN number to a string pointer -func convertIntBaseNtoString(n *big.Int) *string { - ans := "" +func convertIntUnicodeToString(n *big.Int) string { + if n.Cmp(big.NewInt(0)) == 0 { + return "" + } base := big.NewInt(constants.UnicodeSize) x := new(big.Int).Set(n) + var runes []rune for x.Cmp(big.NewInt(0)) > 0 { rem := new(big.Int).Mod(x, base) - ans = string(rune(rem.Int64())) + ans + runes = append(runes, rune(rem.Int64())) x.Div(x, base) } - return &ans + slices.Reverse(runes) + return string(runes) } diff --git a/pkg/jdbc/jdbc.go b/pkg/jdbc/jdbc.go index ee9b399f8..e2546ad08 100644 --- a/pkg/jdbc/jdbc.go +++ b/pkg/jdbc/jdbc.go @@ -429,9 +429,6 @@ func MysqlLimitOffsetScanQuery(stream types.StreamInterface, chunk types.Chunk, func MysqlChunkScanQuery(stream types.StreamInterface, filterColumns []string, chunk types.Chunk, extraFilter string) string { condition := buildChunkConditionMySQL(filterColumns, chunk, extraFilter) quotedTable := QuoteTable(stream.Namespace(), stream.Name(), constants.MySQL) - if condition == "" { - condition = utils.Ternary(extraFilter != "", extraFilter, "1 = 1").(string) - } return fmt.Sprintf("SELECT * FROM %s WHERE %s", quotedTable, condition) } @@ -505,26 +502,14 @@ func MySQLPrimaryKeyQuery() string { func MySQLTableRowStatsQuery() string { return ` SELECT TABLE_ROWS, - CEIL(data_length / NULLIF(table_rows, 0)) AS avg_row_bytes + CEIL(data_length / NULLIF(table_rows, 0)) AS avg_row_bytes, + DATA_LENGTH FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA = DATABASE() AND TABLE_NAME = ? ` } -// MySQLTABLESizeQuery returns the query to fetch the size of a table in MySQL -func MySQLTableSizeQuery() string { - return ` - SELECT - DATA_LENGTH + INDEX_LENGTH AS table_size - FROM - INFORMATION_SCHEMA.TABLES - WHERE - TABLE_SCHEMA = DATABASE() - AND TABLE_NAME = ? - ` -} - // MySQLTableExistsQuery returns the query to check if a table has any rows using EXISTS func MySQLTableExistsQuery(stream types.StreamInterface) string { quotedTable := QuoteTable(stream.Namespace(), stream.Name(), constants.MySQL) From 1eacf5acd0a893430d133577447072e0d7406b37 Mon Sep 17 00:00:00 2001 From: saksham-datazip Date: Tue, 10 Feb 2026 15:26:07 +0530 Subject: [PATCH 09/23] feat: resolving-lint-extra-spaces --- drivers/mssql/internal/incremental.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mssql/internal/incremental.go b/drivers/mssql/internal/incremental.go index dca23183f..237a8a59c 100644 --- a/drivers/mssql/internal/incremental.go +++ b/drivers/mssql/internal/incremental.go @@ -50,4 +50,4 @@ func (m *MSSQL) FetchMaxCursorValues(ctx context.Context, stream types.StreamInt return nil, nil, err } return maxPrimaryCursorValue, maxSecondaryCursorValue, nil -} +} \ No newline at end of file From 964a2ee4f9f678d63c49ba38c53bb6235fc6bebf Mon Sep 17 00:00:00 2001 From: saksham-datazip Date: Tue, 10 Feb 2026 15:37:03 +0530 Subject: [PATCH 10/23] feat: lint error resolved --- drivers/mysql/internal/config.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mysql/internal/config.go b/drivers/mysql/internal/config.go index b1fd93250..55995eda8 100644 --- a/drivers/mysql/internal/config.go +++ b/drivers/mysql/internal/config.go @@ -63,7 +63,7 @@ func (c *Config) URI() (string, error) { if err != nil { return "", fmt.Errorf("failed to build TLS config: %s", err) } - + tlsConfigName := "mysql_" + utils.ULID() if err := mysql.RegisterTLSConfig(tlsConfigName, tlsConfig); err != nil { return "", fmt.Errorf("failed to register TLS config: %s", err) From 11a9f03eb9aeccaeb64515fd8259fa9745535a94 Mon Sep 17 00:00:00 2001 From: saksham-datazip Date: Tue, 10 Feb 2026 15:57:17 +0530 Subject: [PATCH 11/23] feat: self-reviewed --- drivers/mysql/internal/backfill.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mysql/internal/backfill.go b/drivers/mysql/internal/backfill.go index a956150a9..7b98164cb 100644 --- a/drivers/mysql/internal/backfill.go +++ b/drivers/mysql/internal/backfill.go @@ -44,7 +44,7 @@ func (m *MySQL) ChunkIterator(ctx context.Context, stream types.StreamInterface, logger.Debugf("Starting backfill from %v to %v with filter: %s, args: %v", chunk.Min, chunk.Max, filter, args) // Get chunks from state or calculate new ones - var stmt string + stmt := "" if chunkColumn != "" { stmt = jdbc.MysqlChunkScanQuery(stream, []string{chunkColumn}, chunk, filter) } else if len(pkColumns) > 0 { From 94a6fd80ed7534dde3c5c607a79a2a47d96d7aa8 Mon Sep 17 00:00:00 2001 From: saksham-datazip Date: Mon, 16 Feb 2026 18:37:23 +0530 Subject: [PATCH 12/23] feat: recommiting the logical issue in mysql chunking calculation --- drivers/mysql/internal/backfill.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mysql/internal/backfill.go b/drivers/mysql/internal/backfill.go index 7b98164cb..5bfd2520f 100644 --- a/drivers/mysql/internal/backfill.go +++ b/drivers/mysql/internal/backfill.go @@ -241,7 +241,7 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo return fmt.Errorf("failed to convert string to int: %v", err) } - expectedChunks := (avgSchemaSize + chunkSize - 1) / chunkSize + expectedChunks := int64(math.Ceil(float64(avgSchemaSize) / float64(constants.EffectiveParquetSize))) if expectedChunks <= 0 { expectedChunks = 1 } From 6e5e82f93831b6ff557c173409ef28a0c078be5d Mon Sep 17 00:00:00 2001 From: saksham-datazip Date: Tue, 24 Feb 2026 13:14:57 +0530 Subject: [PATCH 13/23] feat: added splitviaprimarykey function --- constants/constants.go | 5 +- drivers/mysql/internal/backfill.go | 169 ++++++++++++++++++++++------- pkg/jdbc/jdbc.go | 109 +++++++++++++++---- 3 files changed, 217 insertions(+), 66 deletions(-) diff --git a/constants/constants.go b/constants/constants.go index a1ccbb939..f431f18df 100644 --- a/constants/constants.go +++ b/constants/constants.go @@ -36,8 +36,9 @@ const ( // DistributionLower is the lower bound for distribution factor DistributionLower = 0.05 // DistributionUpper is the upper bound for distribution factor - DistributionUpper = 100.0 - UnicodeSize = 1114112 + DistributionUpper = 100.0 + UnicodeSize = 1114112 + MysqlChunkSizeReductionFactor = float64(0.8) ) type DriverType string diff --git a/drivers/mysql/internal/backfill.go b/drivers/mysql/internal/backfill.go index 5bfd2520f..61c6acdc7 100644 --- a/drivers/mysql/internal/backfill.go +++ b/drivers/mysql/internal/backfill.go @@ -9,6 +9,7 @@ import ( "slices" "sort" "strings" + "unicode/utf8" "github.com/datazip-inc/olake/constants" "github.com/datazip-inc/olake/destination" @@ -45,13 +46,15 @@ func (m *MySQL) ChunkIterator(ctx context.Context, stream types.StreamInterface, logger.Debugf("Starting backfill from %v to %v with filter: %s, args: %v", chunk.Min, chunk.Max, filter, args) // Get chunks from state or calculate new ones stmt := "" + var chunkArgs []any if chunkColumn != "" { - stmt = jdbc.MysqlChunkScanQuery(stream, []string{chunkColumn}, chunk, filter) + stmt, chunkArgs = jdbc.MysqlChunkScanQuery(stream, []string{chunkColumn}, chunk, filter) } else if len(pkColumns) > 0 { - stmt = jdbc.MysqlChunkScanQuery(stream, pkColumns, chunk, filter) + stmt, chunkArgs = jdbc.MysqlChunkScanQuery(stream, pkColumns, chunk, filter) } else { stmt = jdbc.MysqlLimitOffsetScanQuery(stream, chunk, filter) } + args = append(chunkArgs, args...) logger.Debugf("Executing chunk query: %s", stmt) setter := jdbc.NewReader(ctx, stmt, func(ctx context.Context, query string, queryArgs ...any) (*sql.Rows, error) { return tx.QueryContext(ctx, query, args...) @@ -64,10 +67,12 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo var approxRowCount int64 var avgRowSize any var avgSchemaSize int64 + var tableCollationType string + var dataMaxLength sql.NullInt64 approxRowCountQuery := jdbc.MySQLTableRowStatsQuery() - err := m.client.QueryRowContext(ctx, approxRowCountQuery, stream.Name()).Scan(&approxRowCount, &avgRowSize, &avgSchemaSize) + err := m.client.QueryRowContext(ctx, approxRowCountQuery, stream.Name()).Scan(&approxRowCount, &avgRowSize, &avgSchemaSize, &tableCollationType) if err != nil { - return nil, fmt.Errorf("failed to get approx row count and avg row size: %s", err) + return nil, fmt.Errorf("failed to fetch RowStats query for table=%s: %v", stream.Name(), err) } if approxRowCount == 0 { @@ -118,11 +123,34 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo logger.Debugf("Stream %s: Failed to get table extremes: %v", stream.ID(), err) } } + // Supported MySQL string-like PK datatypes + var stringTypes = map[string]struct{}{ + "char": {}, + "varchar": {}, + } + //defining boolean to check if string is supported or not + stringSupportedPk := false + if len(pkColumns) == 1 { isNumericAndEvenDistributed, step, minFloat, maxFloat, err = IsNumericAndEvenDistributed(minVal, maxVal, approxRowCount, chunkSize) if err != nil { + isNumericAndEvenDistributed = false logger.Debugf("Stream %s: PK is not numeric or conversion failed, falling back to string splitting: %v", stream.ID(), err) } + var dataType string + query := jdbc.MySQLColumnTypeQuery() + err = m.client.QueryRowContext(ctx, query, stream.Name(), pkColumns[0]).Scan(&dataType, &dataMaxLength) + if err != nil { + logger.Errorf("failed to fetch Column DataType and max length %s", err) + } else { + if _, ok := stringTypes[dataType]; ok { + stringSupportedPk = true + fmt.Println("This is a string type PK") + } + if dataMaxLength.Valid { + fmt.Println("Data Max Length:", dataMaxLength.Int64) + } + } } // Takes the user defined batch size as chunkSize // TODO: common-out the chunking logic for db2, mssql, mysql @@ -223,22 +251,22 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo //used mathematical calculation to split the chunks for cases where the pk columns size is 1 and pk data type is string splitEvenlyForString := func(chunks *types.Set[types.Chunk]) error { var maxValBaseN, minValBaseN big.Int - - err := utils.ConcurrentF( - ctx, - func(ctx context.Context) error { - val, err := convertUnicodeStringToInt(utils.ConvertToString(maxVal)) - maxValBaseN.Set(&val) - return err - }, - func(ctx context.Context) error { - val, err := convertUnicodeStringToInt(utils.ConvertToString(minVal)) - minValBaseN.Set(&val) - return err - }, - ) - if err != nil { - return fmt.Errorf("failed to convert string to int: %v", err) + var validChunksCount int + maxValPadded := utils.ConvertToString(maxVal) + minValPadded := utils.ConvertToString(minVal) + if dataMaxLength.Valid { + maxValPadded = padRightNull(maxValPadded, int(dataMaxLength.Int64)) + minValPadded = padRightNull(minValPadded, int(dataMaxLength.Int64)) + } + if val, err := convertUnicodeStringToInt(maxValPadded); err != nil { + return fmt.Errorf("failed to convert maxVal: %v", err) + } else { + maxValBaseN.Set(&val) + } + if val, err := convertUnicodeStringToInt(minValPadded); err != nil { + return fmt.Errorf("failed to convert minVal: %v", err) + } else { + minValBaseN.Set(&val) } expectedChunks := int64(math.Ceil(float64(avgSchemaSize) / float64(constants.EffectiveParquetSize))) @@ -246,46 +274,97 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo expectedChunks = 1 } chunkdiff := new(big.Int).Sub(&maxValBaseN, &minValBaseN) - chunkdiff.Div(chunkdiff, big.NewInt(expectedChunks)) - prev := &minValBaseN + chunkdiff.Add(chunkdiff, new(big.Int).Sub(big.NewInt(expectedChunks), big.NewInt(1))) + chunkdiff.Div(chunkdiff, big.NewInt(expectedChunks)) //ceil division set up + rangeSlice := []string{} + for i := int64(0); i < int64(5); i++ { + temporarychunkdiff := new(big.Int).Set(chunkdiff) + temporarychunkdiff.Add(temporarychunkdiff, big.NewInt(i)) + temporarychunkdiff.Div(temporarychunkdiff, big.NewInt(i+1)) + curr := new(big.Int).Set(&minValBaseN) + for j := int64(0); j < expectedChunks && curr.Cmp(&maxValBaseN) < 0; j++ { + rangeSlice = append(rangeSlice, convertIntUnicodeToString(curr)) + curr.Add(curr, temporarychunkdiff) + } + rangeSlice = append(rangeSlice, convertIntUnicodeToString(&maxValBaseN)) + query, args := jdbc.MySQLDistinctValuesWithCollationQuery(rangeSlice, tableCollationType) + rows, err := m.client.QueryContext(ctx, query, args...) + if err != nil { + return fmt.Errorf("failed to run distinct query: %v", err) + } + rangeSlice = rangeSlice[:0] + for rows.Next() { + var val string + if err := rows.Scan(&val); err != nil { + logger.Errorf("failed to scan row: %v", err) + } + rangeSlice = append(rangeSlice, val) + } + rows.Close() + query, args = jdbc.MySQLCountGeneratedInRange(rangeSlice, tableCollationType, minValPadded, maxValPadded) + err = m.client.QueryRowContext(ctx, query, args...).Scan(&validChunksCount) + if err != nil { + return fmt.Errorf("failed to run count query: %v", err) + } + if float64(validChunksCount) >= float64(expectedChunks)*constants.MysqlChunkSizeReductionFactor { + logger.Debug("Successfully Generated Chunks") + for i, val := range rangeSlice { + logger.Debugf("Boundary[%d] = %q", i, val) + } + break + } + if float64(validChunksCount) < float64(expectedChunks)*constants.MysqlChunkSizeReductionFactor && i == 4 { + logger.Warnf("failed to generate chunks for stream %s, falling back to primary key chunking", stream.ID()) + err = splitViaPrimaryKey(stream, chunks) + if err != nil { + return fmt.Errorf("failed to generate chunks for stream %s: %v", stream.ID(), err) + } + return nil + } + rangeSlice = rangeSlice[:0] + } + if len(rangeSlice) == 0 { + return nil + } + prev := rangeSlice[0] chunks.Insert(types.Chunk{ Min: nil, - Max: convertIntUnicodeToString(prev), + Max: prev, }) - for next := new(big.Int).Add(prev, chunkdiff); next.Cmp(&maxValBaseN) < 0; next.Add(next, chunkdiff) { - var minStr, maxStr string - _ = utils.ConcurrentF(ctx, - func(ctx context.Context) error { - minStr = convertIntUnicodeToString(prev) - return nil - }, - func(ctx context.Context) error { - maxStr = convertIntUnicodeToString(next) - return nil - }, - ) + for idx := range rangeSlice { + if idx == 0 { + continue + } + currVal := rangeSlice[idx] chunks.Insert(types.Chunk{ - Min: minStr, - Max: maxStr, + Min: prev, + Max: currVal, }) - prev = new(big.Int).Set(next) + prev = currVal } chunks.Insert(types.Chunk{ - Min: convertIntUnicodeToString(prev), + Min: prev, Max: nil, }) return nil } - switch { case len(pkColumns) == 1 && isNumericAndEvenDistributed: + logger.Debugf("Using splitEvenlyForInt Method for stream %s", stream.ID()) splitEvenlyForInt(chunks, float64(step)) - case len(pkColumns) == 1: + logger.Debugf("Chunking completed using splitEvenlyForInt Method for stream %s", stream.ID()) + case len(pkColumns) == 1 && stringSupportedPk: + logger.Debugf("Using splitEvenlyForString Method for stream %s", stream.ID()) err = splitEvenlyForString(chunks) + logger.Debugf("Chunking completed using splitEvenlyForString Method for stream %s", stream.ID()) case len(pkColumns) > 1: + logger.Debugf("Using SplitViaPrimaryKey Method for stream %s", stream.ID()) err = splitViaPrimaryKey(stream, chunks) + logger.Debugf("Chunking completed using SplitViaPrimaryKey Method for stream %s", stream.ID()) default: + logger.Debugf("Falling back to limit offset method for stream %s", stream.ID()) err = limitOffsetChunking(chunks) + logger.Debugf("Chunking completed using limit offset method for stream %s", stream.ID()) } return chunks, err } @@ -346,3 +425,11 @@ func convertIntUnicodeToString(n *big.Int) string { slices.Reverse(runes) return string(runes) } + +func padRightNull(s string, maxLength int) string { + length := utf8.RuneCountInString(s) + if length >= maxLength { + return s + } + return s + strings.Repeat("\x00", maxLength-length) +} diff --git a/pkg/jdbc/jdbc.go b/pkg/jdbc/jdbc.go index a8d6ad39f..a6ecf0a27 100644 --- a/pkg/jdbc/jdbc.go +++ b/pkg/jdbc/jdbc.go @@ -232,31 +232,28 @@ func PostgresChunkScanQuery(stream types.StreamInterface, filterColumn string, c // MySQL-Specific Queries // buildChunkConditionMySQL builds the condition for a chunk in MySQL -func buildChunkConditionMySQL(filterColumns []string, chunk types.Chunk, extraFilter string) string { +func buildChunkConditionMySQL(filterColumns []string, chunk types.Chunk, extraFilter string) (string, []any) { quotedCols := QuoteColumns(filterColumns, constants.MySQL) colTuple := "(" + strings.Join(quotedCols, ", ") + ")" - buildSQLTuple := func(val any) string { - parts := strings.Split(val.(string), ",") - for i, part := range parts { - parts[i] = fmt.Sprintf("'%s'", strings.TrimSpace(part)) - } - return strings.Join(parts, ", ") + var conditions []string + var args []any + if chunk.Min != nil { + conditions = append(conditions, fmt.Sprintf("%s >= (?)", colTuple)) + args = append(args, chunk.Min) } - chunkCond := "" - switch { - case chunk.Min != nil && chunk.Max != nil: - chunkCond = fmt.Sprintf("%s >= (%s) AND %s < (%s)", colTuple, buildSQLTuple(chunk.Min), colTuple, buildSQLTuple(chunk.Max)) - case chunk.Min != nil: - chunkCond = fmt.Sprintf("%s >= (%s)", colTuple, buildSQLTuple(chunk.Min)) - case chunk.Max != nil: - chunkCond = fmt.Sprintf("%s < (%s)", colTuple, buildSQLTuple(chunk.Max)) + + if chunk.Max != nil { + conditions = append(conditions, fmt.Sprintf("%s < (?)", colTuple)) + args = append(args, chunk.Max) } - // Both filter and chunk cond both should exist + + chunkCond := strings.Join(conditions, " AND ") + if extraFilter != "" && chunkCond != "" { - return fmt.Sprintf("(%s) AND (%s)", chunkCond, extraFilter) + chunkCond = fmt.Sprintf("(%s) AND (%s)", chunkCond, extraFilter) } - return chunkCond + return chunkCond, args } // buildChunkConditionMSSQL builds a WHERE condition for scanning a chunk in MSSQL. @@ -426,10 +423,10 @@ func MysqlLimitOffsetScanQuery(stream types.StreamInterface, chunk types.Chunk, } // MySQLWithoutState builds a chunk scan query for MySql -func MysqlChunkScanQuery(stream types.StreamInterface, filterColumns []string, chunk types.Chunk, extraFilter string) string { - condition := buildChunkConditionMySQL(filterColumns, chunk, extraFilter) +func MysqlChunkScanQuery(stream types.StreamInterface, filterColumns []string, chunk types.Chunk, extraFilter string) (string, []any) { + condition, args := buildChunkConditionMySQL(filterColumns, chunk, extraFilter) quotedTable := QuoteTable(stream.Namespace(), stream.Name(), constants.MySQL) - return fmt.Sprintf("SELECT * FROM %s WHERE %s", quotedTable, condition) + return fmt.Sprintf("SELECT * FROM %s WHERE %s", quotedTable, condition), args } // MinMaxQueryMySQL returns the query to fetch MIN and MAX values of a column in a MySQL table @@ -498,18 +495,84 @@ func MySQLPrimaryKeyQuery() string { ` } -// MySQLTableRowStatsQuery returns the query to fetch the estimated row count and average row size of a table in MySQL +// MySQLTableRowStatsQuery returns the query to fetch the estimated row count, average row size, table size and table collation of a table in MySQL func MySQLTableRowStatsQuery() string { return ` SELECT TABLE_ROWS, CEIL(data_length / NULLIF(table_rows, 0)) AS avg_row_bytes, - DATA_LENGTH + DATA_LENGTH, + TABLE_COLLATION FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA = DATABASE() AND TABLE_NAME = ? ` } +func MySQLColumnTypeQuery() string { + return ` + SELECT DATA_TYPE ,CHARACTER_MAXIMUM_LENGTH + FROM INFORMATION_SCHEMA.COLUMNS + WHERE TABLE_SCHEMA = DATABASE() + AND TABLE_NAME = ? + AND COLUMN_NAME = ? + LIMIT 1; + ` +} + +// MySQLDistinctValuesWithCollationQuery builds a DISTINCT query over a slice of strings +// using the table's collation type. +func MySQLDistinctValuesWithCollationQuery(values []string, tableCollationType string) (string, []any) { + if len(values) == 0 { + return "", nil + } + unionParts := make([]string, 0, len(values)) + args := make([]any, 0, len(values)) + for _, v := range values { + unionParts = append(unionParts, "SELECT ? AS val") + args = append(args, v) + } + query := fmt.Sprintf(` + SELECT DISTINCT val COLLATE %s AS val + FROM ( + %s + ) AS t + ORDER BY val COLLATE %s; + `, tableCollationType, strings.Join(unionParts, "\nUNION ALL\n"), tableCollationType) + return query, args +} + +func MySQLCountGeneratedInRange(values []string, tableCollationType string, minVal, maxVal string) (string, []any) { + if len(values) == 0 { + return "", nil + } + + unionParts := make([]string, 0, len(values)) + args := make([]any, 0, len(values)+2) + + for _, v := range values { + unionParts = append(unionParts, "SELECT ? AS val") + args = append(args, v) + } + + args = append(args, minVal, maxVal, maxVal, minVal) + + query := fmt.Sprintf(` + SELECT GREATEST( + SUM(CASE + WHEN val COLLATE %s >= ? AND val COLLATE %s <= ? + THEN 1 ELSE 0 END), + SUM(CASE + WHEN val COLLATE %s >= ? AND val COLLATE %s <= ? + THEN 1 ELSE 0 END) + ) AS max_count + FROM ( + %s + ) AS t; + `, tableCollationType, tableCollationType, tableCollationType, tableCollationType, strings.Join(unionParts, "\nUNION ALL\n")) + + return query, args +} + // MySQLTableExistsQuery returns the query to check if a table has any rows using EXISTS func MySQLTableExistsQuery(stream types.StreamInterface) string { quotedTable := QuoteTable(stream.Namespace(), stream.Name(), constants.MySQL) From 69a1714563439bfa30902a5590f581c16bcdf221 Mon Sep 17 00:00:00 2001 From: saksham-datazip Date: Sat, 28 Feb 2026 17:54:51 +0530 Subject: [PATCH 14/23] chore: improved formatting and resolved calculating validationcount querry --- drivers/mysql/internal/backfill.go | 221 +++++++++++++++++++++-------- pkg/jdbc/jdbc.go | 4 +- 2 files changed, 163 insertions(+), 62 deletions(-) diff --git a/drivers/mysql/internal/backfill.go b/drivers/mysql/internal/backfill.go index 61c6acdc7..e611d85ea 100644 --- a/drivers/mysql/internal/backfill.go +++ b/drivers/mysql/internal/backfill.go @@ -64,11 +64,15 @@ func (m *MySQL) ChunkIterator(ctx context.Context, stream types.StreamInterface, } func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPool, stream types.StreamInterface) (*types.Set[types.Chunk], error) { - var approxRowCount int64 - var avgRowSize any - var avgSchemaSize int64 - var tableCollationType string - var dataMaxLength sql.NullInt64 + + var ( + approxRowCount int64 + avgRowSize any + avgSchemaSize int64 + tableCollationType string + dataMaxLength sql.NullInt64 + ) + approxRowCountQuery := jdbc.MySQLTableRowStatsQuery() err := m.client.QueryRowContext(ctx, approxRowCountQuery, stream.Name()).Scan(&approxRowCount, &avgRowSize, &avgSchemaSize, &tableCollationType) if err != nil { @@ -104,7 +108,7 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo var ( isNumericAndEvenDistributed bool - step int64 + chunkStepSize int64 minVal any //to define lower range of the chunk maxVal any //to define upper range of the chunk minFloat float64 @@ -117,10 +121,10 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo } sort.Strings(pkColumns) - if len(pkColumns) > 0 || chunkColumn != "" { + if len(pkColumns) > 0 { minVal, maxVal, err = m.getTableExtremes(ctx, stream, pkColumns) if err != nil { - logger.Debugf("Stream %s: Failed to get table extremes: %v", stream.ID(), err) + return nil, fmt.Errorf("Stream %s: Failed to get table extremes: %v", stream.ID(), err) } } // Supported MySQL string-like PK datatypes @@ -132,26 +136,27 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo stringSupportedPk := false if len(pkColumns) == 1 { - isNumericAndEvenDistributed, step, minFloat, maxFloat, err = IsNumericAndEvenDistributed(minVal, maxVal, approxRowCount, chunkSize) + isNumericAndEvenDistributed, chunkStepSize, minFloat, maxFloat, err = IsNumericAndEvenDistributed(minVal, maxVal, approxRowCount, chunkSize) if err != nil { isNumericAndEvenDistributed = false - logger.Debugf("Stream %s: PK is not numeric or conversion failed, falling back to string splitting: %v", stream.ID(), err) + logger.Infof("Stream %s: PK is not numeric or conversion failed, falling back to string splitting: %v", stream.ID(), err) } - var dataType string - query := jdbc.MySQLColumnTypeQuery() - err = m.client.QueryRowContext(ctx, query, stream.Name(), pkColumns[0]).Scan(&dataType, &dataMaxLength) - if err != nil { - logger.Errorf("failed to fetch Column DataType and max length %s", err) - } else { - if _, ok := stringTypes[dataType]; ok { + if !isNumericAndEvenDistributed { + var dataType string + query := jdbc.MySQLColumnTypeQuery() + err = m.client.QueryRowContext(ctx, query, stream.Name(), pkColumns[0]).Scan(&dataType, &dataMaxLength) + if err != nil { + return nil, fmt.Errorf("failed to fetch Column DataType and max length %s", err) + } else if _, ok := stringTypes[dataType]; ok { stringSupportedPk = true - fmt.Println("This is a string type PK") - } - if dataMaxLength.Valid { - fmt.Println("Data Max Length:", dataMaxLength.Int64) + logger.Infof("%s is a string type PK",pkColumns[0]) + if dataMaxLength.Valid { + logger.Infof("Data Max Length: %d", dataMaxLength.Int64) + } } } } + // Takes the user defined batch size as chunkSize // TODO: common-out the chunking logic for db2, mssql, mysql splitViaPrimaryKey := func(stream types.StreamInterface, chunks *types.Set[types.Chunk]) error { @@ -203,6 +208,7 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo }) } + logger.Infof("Chunking completed using SplitViaPrimaryKey Method for stream %s", stream.ID()) return nil }) } @@ -224,18 +230,33 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo Min: utils.ConvertToString(lastChunk), Max: nil, }) + logger.Infof("Chunking completed using limit offset method for stream %s", stream.ID()) return nil }) } - //used mathematical calculation to split the chunks for cases where the distribution factor is within the range when pk is numeric - splitEvenlyForInt := func(chunks *types.Set[types.Chunk], step float64) { + /* + splitEvenlyForInt generates chunk boundaries for numeric values by dividing the range [minFloat, maxFloat] using an arithmetic progression (AP). + + Each boundary follows: + next = prev + chunkStepSize + + Example: + minFloat = 0, maxFloat = 100, chunkStepSize = 25 + + AP sequence: + 0 → 25 → 50 → 75 → 100 + + Chunks formed: + (-∞, 0), [0,25), [25,50), [50,75), [75,100), [100, +∞) + */ + splitEvenlyForInt := func(chunks *types.Set[types.Chunk], chunkStepSize float64) { chunks.Insert(types.Chunk{ Min: nil, Max: utils.ConvertToString(minFloat), }) prev := minFloat - for next := minFloat + step; next <= maxFloat; next += step { + for next := minFloat + chunkStepSize; next <= maxFloat; next += chunkStepSize { chunks.Insert(types.Chunk{ Min: utils.ConvertToString(prev), Max: utils.ConvertToString(next), @@ -246,46 +267,77 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo Min: utils.ConvertToString(prev), Max: nil, }) + logger.Infof("Chunking completed using splitEvenlyForInt Method for stream %s", stream.ID()) } - //used mathematical calculation to split the chunks for cases where the pk columns size is 1 and pk data type is string + /* + splitEvenlyForString generates chunk boundaries for string-based primary keys + by converting string values into a numeric (big.Int) space and iteratively + splitting that range. + + Workflow: + 1. Convert min and max string values into padded form and map them into big.Int using unicode-based encoding. + 2. Estimate the expected number of chunks based on schema size and target file size. + 3. Compute an initial chunk interval using ceil division on the numeric range. + 4. Iteratively (up to 5 attempts): + - Adjust the interval using an AP-based variation. + - Generate candidate boundaries in numeric space and map them back to strings. + - Query distinct values using collation-aware SQL (ordering handled in query). + - Validate the number of effective chunks using a count query. + - If at least the required threshold (~80%) of chunks is achieved, accept and stop. + 5. If all attempts fail, fallback to primary key–based chunking. + + Final Step: + - Use the validated boundary values to construct non-overlapping chunks + covering the full range [min, max], including open-ended boundaries. + + Example: + minVal = "aa", maxVal = "az", expectedChunks = 3 + + Generated boundaries after refining boundaries using collation-aware DB queries: + ["aa", "ai", "ar", "az"] + + Chunks: + (-∞, "aa"), ["aa","ai"), ["ai","ar"), ["ar","az"), ["az", +∞) + */ splitEvenlyForString := func(chunks *types.Set[types.Chunk]) error { var maxValBaseN, minValBaseN big.Int var validChunksCount int + maxValPadded := utils.ConvertToString(maxVal) minValPadded := utils.ConvertToString(minVal) + if dataMaxLength.Valid { maxValPadded = padRightNull(maxValPadded, int(dataMaxLength.Int64)) minValPadded = padRightNull(minValPadded, int(dataMaxLength.Int64)) } - if val, err := convertUnicodeStringToInt(maxValPadded); err != nil { - return fmt.Errorf("failed to convert maxVal: %v", err) - } else { - maxValBaseN.Set(&val) - } - if val, err := convertUnicodeStringToInt(minValPadded); err != nil { - return fmt.Errorf("failed to convert minVal: %v", err) - } else { - minValBaseN.Set(&val) - } + + val1 := convertUnicodeStringToInt(maxValPadded) + maxValBaseN.Set(&val1) + val2 := convertUnicodeStringToInt(minValPadded) + minValBaseN.Set(&val2) expectedChunks := int64(math.Ceil(float64(avgSchemaSize) / float64(constants.EffectiveParquetSize))) if expectedChunks <= 0 { expectedChunks = 1 } + chunkdiff := new(big.Int).Sub(&maxValBaseN, &minValBaseN) chunkdiff.Add(chunkdiff, new(big.Int).Sub(big.NewInt(expectedChunks), big.NewInt(1))) chunkdiff.Div(chunkdiff, big.NewInt(expectedChunks)) //ceil division set up + rangeSlice := []string{} for i := int64(0); i < int64(5); i++ { temporarychunkdiff := new(big.Int).Set(chunkdiff) temporarychunkdiff.Add(temporarychunkdiff, big.NewInt(i)) temporarychunkdiff.Div(temporarychunkdiff, big.NewInt(i+1)) curr := new(big.Int).Set(&minValBaseN) + for j := int64(0); j < expectedChunks && curr.Cmp(&maxValBaseN) < 0; j++ { rangeSlice = append(rangeSlice, convertIntUnicodeToString(curr)) curr.Add(curr, temporarychunkdiff) } + rangeSlice = append(rangeSlice, convertIntUnicodeToString(&maxValBaseN)) query, args := jdbc.MySQLDistinctValuesWithCollationQuery(rangeSlice, tableCollationType) rows, err := m.client.QueryContext(ctx, query, args...) @@ -293,28 +345,32 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo return fmt.Errorf("failed to run distinct query: %v", err) } rangeSlice = rangeSlice[:0] + for rows.Next() { var val string if err := rows.Scan(&val); err != nil { - logger.Errorf("failed to scan row: %v", err) + return fmt.Errorf("failed to scan row: %v", err) } rangeSlice = append(rangeSlice, val) } rows.Close() + query, args = jdbc.MySQLCountGeneratedInRange(rangeSlice, tableCollationType, minValPadded, maxValPadded) err = m.client.QueryRowContext(ctx, query, args...).Scan(&validChunksCount) if err != nil { return fmt.Errorf("failed to run count query: %v", err) } + if float64(validChunksCount) >= float64(expectedChunks)*constants.MysqlChunkSizeReductionFactor { - logger.Debug("Successfully Generated Chunks") + logger.Infof("Successfully Generated Chunks using splitEvenlyForString Method for stream %s", stream.ID()) for i, val := range rangeSlice { logger.Debugf("Boundary[%d] = %q", i, val) } break } + if float64(validChunksCount) < float64(expectedChunks)*constants.MysqlChunkSizeReductionFactor && i == 4 { - logger.Warnf("failed to generate chunks for stream %s, falling back to primary key chunking", stream.ID()) + logger.Warnf("failed to generate chunks for stream %s, falling back to splitviaprimarykey method", stream.ID()) err = splitViaPrimaryKey(stream, chunks) if err != nil { return fmt.Errorf("failed to generate chunks for stream %s: %v", stream.ID(), err) @@ -323,14 +379,17 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo } rangeSlice = rangeSlice[:0] } + if len(rangeSlice) == 0 { return nil } + prev := rangeSlice[0] chunks.Insert(types.Chunk{ Min: nil, Max: prev, }) + for idx := range rangeSlice { if idx == 0 { continue @@ -342,30 +401,34 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo }) prev = currVal } + chunks.Insert(types.Chunk{ Min: prev, Max: nil, }) + + logger.Infof("Chunking completed using splitEvenlyForString Method for stream %s", stream.ID()) return nil } + switch { case len(pkColumns) == 1 && isNumericAndEvenDistributed: - logger.Debugf("Using splitEvenlyForInt Method for stream %s", stream.ID()) - splitEvenlyForInt(chunks, float64(step)) - logger.Debugf("Chunking completed using splitEvenlyForInt Method for stream %s", stream.ID()) + logger.Infof("Using splitEvenlyForInt Method for stream %s", stream.ID()) + splitEvenlyForInt(chunks, float64(chunkStepSize)) + case len(pkColumns) == 1 && stringSupportedPk: - logger.Debugf("Using splitEvenlyForString Method for stream %s", stream.ID()) + logger.Infof("Using splitEvenlyForString Method for stream %s", stream.ID()) err = splitEvenlyForString(chunks) - logger.Debugf("Chunking completed using splitEvenlyForString Method for stream %s", stream.ID()) + case len(pkColumns) > 1: - logger.Debugf("Using SplitViaPrimaryKey Method for stream %s", stream.ID()) + logger.Infof("Using SplitViaPrimaryKey Method for stream %s", stream.ID()) err = splitViaPrimaryKey(stream, chunks) - logger.Debugf("Chunking completed using SplitViaPrimaryKey Method for stream %s", stream.ID()) + default: - logger.Debugf("Falling back to limit offset method for stream %s", stream.ID()) + logger.Infof("Falling back to limit offset method for stream %s", stream.ID()) err = limitOffsetChunking(chunks) - logger.Debugf("Chunking completed using limit offset method for stream %s", stream.ID()) } + return chunks, err } @@ -380,25 +443,42 @@ func IsNumericAndEvenDistributed(minVal any, maxVal any, approxRowCount int64, c if approxRowCount == 0 { return false, 0, 0, 0, nil } + minFloat, err1 := typeutils.ReformatFloat64(minVal) + if err1 != nil { + return false, 0, 0, 0, fmt.Errorf("failed to parse minVal: %w", err1) + } + maxFloat, err2 := typeutils.ReformatFloat64(maxVal) - if err1 != nil || err2 != nil { - if err1 != nil { - return false, 0, 0, 0, err1 - } - return false, 0, 0, 0, err2 + if err2 != nil { + return false, 0, 0, 0, fmt.Errorf("failed to parse maxVal: %w", err2) } + distributionFactor := (maxFloat - minFloat + 1) / float64(approxRowCount) + if distributionFactor < constants.DistributionLower || distributionFactor > constants.DistributionUpper { - err := fmt.Errorf("distribution factor is not in the range of %f to %f", constants.DistributionLower, constants.DistributionUpper) - return false, 0, 0, 0, err + return false, 0, 0, 0, fmt.Errorf("distribution factor is not in the range of %f to %f", constants.DistributionLower, constants.DistributionUpper) } - step := int64(math.Max(distributionFactor*float64(chunkSize), 1)) - return true, step, minFloat, maxFloat, nil + + chunkStepSize := int64(math.Max(distributionFactor*float64(chunkSize), 1)) + return true, chunkStepSize, minFloat, maxFloat, nil } -// convert a string to a baseN number -func convertUnicodeStringToInt(s string) (big.Int, error) { +/* + convertUnicodeStringToInt maps a string to a big.Int using base = 1114112(UnicodeSize), treating each rune as a digit in a positional system. + + Value = r₀*base^(n-1) + r₁*base^(n-2) + ... + rₙ + + Example: + s = "aa" + r₀ = 'a' = 97, r₁ = 'a' = 97, base = 1114112 + + Value = r₀*base^(n-1) + r₁*base^(n-2) + = 97*1114112 + 97 + = 108068961 + +*/ +func convertUnicodeStringToInt(s string) big.Int { base := big.NewInt(constants.UnicodeSize) val := big.NewInt(0) @@ -406,10 +486,29 @@ func convertUnicodeStringToInt(s string) (big.Int, error) { val.Mul(val, base) val.Add(val, big.NewInt(int64(ch))) } - return *val, nil + return *val } -// convert a baseN number to a string pointer +/* + convertIntUnicodeToString reconstructs the original string from its big.Int representation by extracting digits in base = 1114112 (UnicodeSize). + + It repeatedly takes modulus and division by base to recover each rune: + rᵢ = n % base, then n = n / base + + Example: + n = 108068961, base = 1114112 + + Step 1: + r₁ = n % base = 97 → 'a' + n = n / base = 97 + + Step 2: + r₀ = n % base = 97 → 'a' + n = 0 + + Reconstructed (after reversing): + "aa" +*/ func convertIntUnicodeToString(n *big.Int) string { if n.Cmp(big.NewInt(0)) == 0 { return "" @@ -417,11 +516,13 @@ func convertIntUnicodeToString(n *big.Int) string { base := big.NewInt(constants.UnicodeSize) x := new(big.Int).Set(n) var runes []rune + for x.Cmp(big.NewInt(0)) > 0 { rem := new(big.Int).Mod(x, base) runes = append(runes, rune(rem.Int64())) x.Div(x, base) } + slices.Reverse(runes) return string(runes) } diff --git a/pkg/jdbc/jdbc.go b/pkg/jdbc/jdbc.go index a6ecf0a27..519984951 100644 --- a/pkg/jdbc/jdbc.go +++ b/pkg/jdbc/jdbc.go @@ -549,13 +549,13 @@ func MySQLCountGeneratedInRange(values []string, tableCollationType string, minV unionParts := make([]string, 0, len(values)) args := make([]any, 0, len(values)+2) + args = append(args, minVal, maxVal, maxVal, minVal) + for _, v := range values { unionParts = append(unionParts, "SELECT ? AS val") args = append(args, v) } - args = append(args, minVal, maxVal, maxVal, minVal) - query := fmt.Sprintf(` SELECT GREATEST( SUM(CASE From 66749ddbf07505c6da70008ef748948768ad215e Mon Sep 17 00:00:00 2001 From: saksham-datazip Date: Sun, 1 Mar 2026 11:15:32 +0530 Subject: [PATCH 15/23] chore: constant size readjusted --- constants/constants.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/constants/constants.go b/constants/constants.go index 1e176303c..f431f18df 100644 --- a/constants/constants.go +++ b/constants/constants.go @@ -30,7 +30,7 @@ const ( // DestinationDatabasePrefix is used as prefix for destination database name DestinationDatabasePrefix = "DESTINATION_DATABASE_PREFIX" // EffectiveParquetSize is the effective size in bytes considering 256mb targeted parquet size, compression ratio as 8 - EffectiveParquetSize = int64(1) * 1024 * int64(1) + EffectiveParquetSize = int64(256) * 1024 * 1024 * int64(8) DB2StateTimestampFormat = "2006-01-02 15:04:05.000000" DefaultStateTimestampFormat = "2006-01-02T15:04:05.000000000Z" // DistributionLower is the lower bound for distribution factor From fa24a2c7bd2726adb4d0124ab59a94eff82c10c2 Mon Sep 17 00:00:00 2001 From: saksham-datazip Date: Mon, 2 Mar 2026 17:52:49 +0530 Subject: [PATCH 16/23] chore: saperated buildChunkConditionMySQL function from mssql --- drivers/mysql/internal/backfill.go | 7 +++--- pkg/jdbc/jdbc.go | 39 ++++++++++++++++++++++-------- 2 files changed, 33 insertions(+), 13 deletions(-) diff --git a/drivers/mysql/internal/backfill.go b/drivers/mysql/internal/backfill.go index a2827abda..e611d85ea 100644 --- a/drivers/mysql/internal/backfill.go +++ b/drivers/mysql/internal/backfill.go @@ -46,14 +46,15 @@ func (m *MySQL) ChunkIterator(ctx context.Context, stream types.StreamInterface, logger.Debugf("Starting backfill from %v to %v with filter: %s, args: %v", chunk.Min, chunk.Max, filter, args) // Get chunks from state or calculate new ones stmt := "" + var chunkArgs []any if chunkColumn != "" { - stmt = jdbc.MysqlChunkScanQuery(stream, []string{chunkColumn}, chunk, filter) + stmt, chunkArgs = jdbc.MysqlChunkScanQuery(stream, []string{chunkColumn}, chunk, filter) } else if len(pkColumns) > 0 { - stmt = jdbc.MysqlChunkScanQuery(stream, pkColumns, chunk, filter) + stmt, chunkArgs = jdbc.MysqlChunkScanQuery(stream, pkColumns, chunk, filter) } else { stmt = jdbc.MysqlLimitOffsetScanQuery(stream, chunk, filter) } - + args = append(chunkArgs, args...) logger.Debugf("Executing chunk query: %s", stmt) setter := jdbc.NewReader(ctx, stmt, func(ctx context.Context, query string, queryArgs ...any) (*sql.Rows, error) { return tx.QueryContext(ctx, query, args...) diff --git a/pkg/jdbc/jdbc.go b/pkg/jdbc/jdbc.go index 860abce00..274d4e2b3 100644 --- a/pkg/jdbc/jdbc.go +++ b/pkg/jdbc/jdbc.go @@ -230,6 +230,32 @@ func PostgresChunkScanQuery(stream types.StreamInterface, filterColumn string, c return fmt.Sprintf(`SELECT * FROM %s WHERE %s`, quotedTable, chunkCond) } +// TODO: Common out buildChunkConditionMySQL for MSSQL, DB2, and other drivers where needed. +// MySQL-Specific Queries buildChunkConditionMySQL builds the condition for a chunk in MySQL +func buildChunkConditionMySQL(filterColumns []string, chunk types.Chunk, extraFilter string) (string, []any) { + quotedCols := QuoteColumns(filterColumns, constants.MySQL) + colTuple := "(" + strings.Join(quotedCols, ", ") + ")" + + var conditions []string + var args []any + if chunk.Min != nil { + conditions = append(conditions, fmt.Sprintf("%s >= (?)", colTuple)) + args = append(args, chunk.Min) + } + + if chunk.Max != nil { + conditions = append(conditions, fmt.Sprintf("%s < (?)", colTuple)) + args = append(args, chunk.Max) + } + + chunkCond := strings.Join(conditions, " AND ") + + if extraFilter != "" && chunkCond != "" { + chunkCond = fmt.Sprintf("(%s) AND (%s)", chunkCond, extraFilter) + } + return chunkCond, args +} + // buildLexicographicChunkCondition builds a WHERE condition for a chunk scan using // lexicographic OR-groups over multiple ordering columns. // @@ -326,13 +352,6 @@ func buildLexicographicChunkCondition(quotedColumns []string, chunk types.Chunk, return chunkCond } -// MySQL-Specific Queries -// buildChunkConditionMySQL builds the condition for a chunk in MySQL. -func buildChunkConditionMySQL(filterColumns []string, chunk types.Chunk, extraFilter string) string { - quotedCols := QuoteColumns(filterColumns, constants.MySQL) - return buildLexicographicChunkCondition(quotedCols, chunk, extraFilter) -} - // MysqlLimitOffsetScanQuery is used to get the rows func MysqlLimitOffsetScanQuery(stream types.StreamInterface, chunk types.Chunk, filter string) string { quotedTable := QuoteTable(stream.Namespace(), stream.Name(), constants.MySQL) @@ -354,10 +373,10 @@ func MysqlLimitOffsetScanQuery(stream types.StreamInterface, chunk types.Chunk, } // MySQLWithoutState builds a chunk scan query for MySql -func MysqlChunkScanQuery(stream types.StreamInterface, filterColumns []string, chunk types.Chunk, extraFilter string) string { - condition := buildChunkConditionMySQL(filterColumns, chunk, extraFilter) +func MysqlChunkScanQuery(stream types.StreamInterface, filterColumns []string, chunk types.Chunk, extraFilter string) (string, []any) { + condition, args := buildChunkConditionMySQL(filterColumns, chunk, extraFilter) quotedTable := QuoteTable(stream.Namespace(), stream.Name(), constants.MySQL) - return fmt.Sprintf("SELECT * FROM %s WHERE %s", quotedTable, condition) + return fmt.Sprintf("SELECT * FROM %s WHERE %s", quotedTable, condition), args } // MinMaxQueryMySQL returns the query to fetch MIN and MAX values of a column in a MySQL table From 84115689e4ef7fc5467f6917bf5727e8f35dea4f Mon Sep 17 00:00:00 2001 From: saksham-datazip Date: Tue, 3 Mar 2026 19:00:47 +0530 Subject: [PATCH 17/23] chore: fixed buildChunkConditionMySQL function for multiple colummns --- pkg/jdbc/jdbc.go | 85 +++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 74 insertions(+), 11 deletions(-) diff --git a/pkg/jdbc/jdbc.go b/pkg/jdbc/jdbc.go index 274d4e2b3..507c3b1b9 100644 --- a/pkg/jdbc/jdbc.go +++ b/pkg/jdbc/jdbc.go @@ -234,24 +234,87 @@ func PostgresChunkScanQuery(stream types.StreamInterface, filterColumn string, c // MySQL-Specific Queries buildChunkConditionMySQL builds the condition for a chunk in MySQL func buildChunkConditionMySQL(filterColumns []string, chunk types.Chunk, extraFilter string) (string, []any) { quotedCols := QuoteColumns(filterColumns, constants.MySQL) - colTuple := "(" + strings.Join(quotedCols, ", ") + ")" - var conditions []string - var args []any - if chunk.Min != nil { - conditions = append(conditions, fmt.Sprintf("%s >= (?)", colTuple)) - args = append(args, chunk.Min) + splitBoundaryValues := func(boundary any) []string { + if boundary == nil { + return nil + } + str := utils.ConvertToString(boundary) + parts := strings.Split(str, ",") + for i, part := range parts { + parts[i] = strings.TrimSpace(part) + } + return parts } - if chunk.Max != nil { - conditions = append(conditions, fmt.Sprintf("%s < (?)", colTuple)) - args = append(args, chunk.Max) + // buildBound creates the expanded logic for: + // (c1, c2, c3) >= (v1, v2, v3) + // as: + // (c1 > v1) OR (c1 = v1 AND c2 > v2) OR (c1 = v1 AND c2 = v2 AND c3 >= v3) + // + // For upper bounds, it creates: + // (c1 < v1) OR (c1 = v1 AND c2 < v2) OR (c1 = v1 AND c2 = v2 AND c3 < v3) + buildBound := func(values []string, isLower bool) (string, []any) { + //note: values can never be empty + var args []any + orGroups := make([]string, 0, len(quotedCols)) + + for colIdx := 0; colIdx < len(quotedCols); colIdx++ { + andConds := make([]string, 0, colIdx+1) + + // Prefix columns must match exactly: c1 = v1 AND c2 = v2 ... + for prefixIdx := 0; prefixIdx < colIdx; prefixIdx++ { + if prefixIdx < len(values) { + andConds = append(andConds, fmt.Sprintf("%s = ?", quotedCols[prefixIdx])) + args = append(args, values[prefixIdx]) + } + } + + var op string + if isLower { + op = ">" + if colIdx == len(quotedCols)-1 { + op = ">=" + } + } else { + op = "<" + } + + if colIdx < len(values) { + andConds = append(andConds, fmt.Sprintf("%s %s ?", quotedCols[colIdx], op)) + args = append(args, values[colIdx]) + } + if len(andConds) > 0 { + orGroups = append(orGroups, "("+strings.Join(andConds, " AND ")+")") + } + } + + return "(" + strings.Join(orGroups, " OR ") + ")", args } - chunkCond := strings.Join(conditions, " AND ") + lowerValues := splitBoundaryValues(chunk.Min) + upperValues := splitBoundaryValues(chunk.Max) + + chunkCond := "" + var args []any + switch { + case chunk.Min != nil && chunk.Max != nil: + lowerCond, lowerArgs := buildBound(lowerValues, true) + upperCond, upperArgs := buildBound(upperValues, false) + if lowerCond != "" && upperCond != "" { + chunkCond = fmt.Sprintf("(%s) AND (%s)", lowerCond, upperCond) + args = append(args, lowerArgs...) + args = append(args, upperArgs...) + } + case chunk.Min != nil: + chunkCond, args = buildBound(lowerValues, true) + case chunk.Max != nil: + chunkCond, args = buildBound(upperValues, false) + } + // Combine with any additional filter if present. if extraFilter != "" && chunkCond != "" { - chunkCond = fmt.Sprintf("(%s) AND (%s)", chunkCond, extraFilter) + return fmt.Sprintf("(%s) AND (%s)", chunkCond, extraFilter), args } return chunkCond, args } From 64f31c1b5e44cd131f4a771b501fe6a310af7764 Mon Sep 17 00:00:00 2001 From: saksham-datazip Date: Wed, 4 Mar 2026 15:04:56 +0530 Subject: [PATCH 18/23] chore: resolved comment for final-testing --- constants/constants.go | 11 +++-- drivers/mysql/internal/backfill.go | 72 ++++++++++++++++-------------- pkg/jdbc/jdbc.go | 8 ++-- 3 files changed, 51 insertions(+), 40 deletions(-) diff --git a/constants/constants.go b/constants/constants.go index f431f18df..8eba31a4d 100644 --- a/constants/constants.go +++ b/constants/constants.go @@ -33,12 +33,15 @@ const ( EffectiveParquetSize = int64(256) * 1024 * 1024 * int64(8) DB2StateTimestampFormat = "2006-01-02 15:04:05.000000" DefaultStateTimestampFormat = "2006-01-02T15:04:05.000000000Z" - // DistributionLower is the lower bound for distribution factor - DistributionLower = 0.05 - // DistributionUpper is the upper bound for distribution factor + // DistributionLower and DistributionUpper define the acceptable range + // of the distribution factor for validating evenly distributed numeric PKs. + DistributionLower = 0.05 DistributionUpper = 100.0 + // UnicodeSize is the total number of valid Unicode code points (0 to 0x10FFFF) UnicodeSize = 1114112 - MysqlChunkSizeReductionFactor = float64(0.8) + // MysqlChunkAcceptanceRatio defines the minimum ratio of expected chunks that must be generated + // for the split to be considered valid. + MysqlChunkAcceptanceRatio = float64(0.8) ) type DriverType string diff --git a/drivers/mysql/internal/backfill.go b/drivers/mysql/internal/backfill.go index e611d85ea..87f3fb7a0 100644 --- a/drivers/mysql/internal/backfill.go +++ b/drivers/mysql/internal/backfill.go @@ -64,7 +64,6 @@ func (m *MySQL) ChunkIterator(ctx context.Context, stream types.StreamInterface, } func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPool, stream types.StreamInterface) (*types.Set[types.Chunk], error) { - var ( approxRowCount int64 avgRowSize any @@ -73,10 +72,10 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo dataMaxLength sql.NullInt64 ) - approxRowCountQuery := jdbc.MySQLTableRowStatsQuery() - err := m.client.QueryRowContext(ctx, approxRowCountQuery, stream.Name()).Scan(&approxRowCount, &avgRowSize, &avgSchemaSize, &tableCollationType) + tableStatsQuery := jdbc.MySQLTableStatsQuery() + err := m.client.QueryRowContext(ctx, tableStatsQuery, stream.Name()).Scan(&approxRowCount, &avgRowSize, &avgSchemaSize, &tableCollationType) if err != nil { - return nil, fmt.Errorf("failed to fetch RowStats query for table=%s: %v", stream.Name(), err) + return nil, fmt.Errorf("failed to fetch TableStats query for table=%s: %s", stream.Name(), err) } if approxRowCount == 0 { @@ -124,35 +123,40 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo if len(pkColumns) > 0 { minVal, maxVal, err = m.getTableExtremes(ctx, stream, pkColumns) if err != nil { - return nil, fmt.Errorf("Stream %s: Failed to get table extremes: %v", stream.ID(), err) + return nil, fmt.Errorf("Stream %s: Failed to get table extremes: %s", stream.ID(), err) } } - // Supported MySQL string-like PK datatypes - var stringTypes = map[string]struct{}{ - "char": {}, - "varchar": {}, - } //defining boolean to check if string is supported or not stringSupportedPk := false if len(pkColumns) == 1 { + // 1. Try Numeric Strategy isNumericAndEvenDistributed, chunkStepSize, minFloat, maxFloat, err = IsNumericAndEvenDistributed(minVal, maxVal, approxRowCount, chunkSize) if err != nil { - isNumericAndEvenDistributed = false - logger.Infof("Stream %s: PK is not numeric or conversion failed, falling back to string splitting: %v", stream.ID(), err) + logger.Infof("Stream %s: PK is not numeric or conversion failed, falling back to string splitting: %s", stream.ID(), err) } + + // 2. If not numeric, check for supported String strategy if !isNumericAndEvenDistributed { var dataType string + // Fetch column type query := jdbc.MySQLColumnTypeQuery() err = m.client.QueryRowContext(ctx, query, stream.Name(), pkColumns[0]).Scan(&dataType, &dataMaxLength) if err != nil { return nil, fmt.Errorf("failed to fetch Column DataType and max length %s", err) - } else if _, ok := stringTypes[dataType]; ok { - stringSupportedPk = true - logger.Infof("%s is a string type PK",pkColumns[0]) - if dataMaxLength.Valid { - logger.Infof("Data Max Length: %d", dataMaxLength.Int64) - } + } + + switch strings.ToLower(dataType){ + case "char", "varchar": + stringSupportedPk = true + logger.Infof("%s is a string type PK",pkColumns[0]) + if dataMaxLength.Valid { + logger.Infof("Data Max Length: %d", dataMaxLength.Int64) + } + + default: + stringSupportedPk = false + logger.Infof("%s is not a string type PK",pkColumns[0]) } } } @@ -327,53 +331,55 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo chunkdiff.Div(chunkdiff, big.NewInt(expectedChunks)) //ceil division set up rangeSlice := []string{} - for i := int64(0); i < int64(5); i++ { + // Try up to 5 times to generate balanced chunks by slightly adjusting the chunk size each iteration. + for retryAttempt := int64(0); retryAttempt < int64(5); retryAttempt++ { temporarychunkdiff := new(big.Int).Set(chunkdiff) - temporarychunkdiff.Add(temporarychunkdiff, big.NewInt(i)) - temporarychunkdiff.Div(temporarychunkdiff, big.NewInt(i+1)) + temporarychunkdiff.Add(temporarychunkdiff, big.NewInt(retryAttempt)) + temporarychunkdiff.Div(temporarychunkdiff, big.NewInt(retryAttempt+1)) curr := new(big.Int).Set(&minValBaseN) - for j := int64(0); j < expectedChunks && curr.Cmp(&maxValBaseN) < 0; j++ { + for chunkIdx := int64(0); chunkIdx < expectedChunks && curr.Cmp(&maxValBaseN) < 0; chunkIdx++ { rangeSlice = append(rangeSlice, convertIntUnicodeToString(curr)) curr.Add(curr, temporarychunkdiff) } + // Align boundaries with actual DB values using MySQL collation ordering rangeSlice = append(rangeSlice, convertIntUnicodeToString(&maxValBaseN)) query, args := jdbc.MySQLDistinctValuesWithCollationQuery(rangeSlice, tableCollationType) rows, err := m.client.QueryContext(ctx, query, args...) if err != nil { - return fmt.Errorf("failed to run distinct query: %v", err) + return fmt.Errorf("failed to run distinct query: %s", err) } rangeSlice = rangeSlice[:0] for rows.Next() { var val string if err := rows.Scan(&val); err != nil { - return fmt.Errorf("failed to scan row: %v", err) + return fmt.Errorf("failed to scan row: %s", err) } rangeSlice = append(rangeSlice, val) } rows.Close() + //counting the number of valid chunks generated i.e., between min and max query, args = jdbc.MySQLCountGeneratedInRange(rangeSlice, tableCollationType, minValPadded, maxValPadded) err = m.client.QueryRowContext(ctx, query, args...).Scan(&validChunksCount) if err != nil { - return fmt.Errorf("failed to run count query: %v", err) + return fmt.Errorf("failed to run count query: %s", err) } - if float64(validChunksCount) >= float64(expectedChunks)*constants.MysqlChunkSizeReductionFactor { + // Accept boundaries if enough valid chunks are produced + if float64(validChunksCount) >= float64(expectedChunks)*constants.MysqlChunkAcceptanceRatio { logger.Infof("Successfully Generated Chunks using splitEvenlyForString Method for stream %s", stream.ID()) - for i, val := range rangeSlice { - logger.Debugf("Boundary[%d] = %q", i, val) - } break } - if float64(validChunksCount) < float64(expectedChunks)*constants.MysqlChunkSizeReductionFactor && i == 4 { + //if the number of valid chunks generated is less than the expected chunks * a constant factor even after 5 iterations, we fallback to splitViaPrimaryKey + if float64(validChunksCount) < float64(expectedChunks)*constants.MysqlChunkAcceptanceRatio && retryAttempt == 4 { logger.Warnf("failed to generate chunks for stream %s, falling back to splitviaprimarykey method", stream.ID()) err = splitViaPrimaryKey(stream, chunks) if err != nil { - return fmt.Errorf("failed to generate chunks for stream %s: %v", stream.ID(), err) + return fmt.Errorf("failed to generate chunks for stream %s: %s", stream.ID(), err) } return nil } @@ -446,12 +452,12 @@ func IsNumericAndEvenDistributed(minVal any, maxVal any, approxRowCount int64, c minFloat, err1 := typeutils.ReformatFloat64(minVal) if err1 != nil { - return false, 0, 0, 0, fmt.Errorf("failed to parse minVal: %w", err1) + return false, 0, 0, 0, fmt.Errorf("failed to parse minVal: %s", err1) } maxFloat, err2 := typeutils.ReformatFloat64(maxVal) if err2 != nil { - return false, 0, 0, 0, fmt.Errorf("failed to parse maxVal: %w", err2) + return false, 0, 0, 0, fmt.Errorf("failed to parse maxVal: %s", err2) } distributionFactor := (maxFloat - minFloat + 1) / float64(approxRowCount) diff --git a/pkg/jdbc/jdbc.go b/pkg/jdbc/jdbc.go index 507c3b1b9..bd0318a52 100644 --- a/pkg/jdbc/jdbc.go +++ b/pkg/jdbc/jdbc.go @@ -314,7 +314,7 @@ func buildChunkConditionMySQL(filterColumns []string, chunk types.Chunk, extraFi // Combine with any additional filter if present. if extraFilter != "" && chunkCond != "" { - return fmt.Sprintf("(%s) AND (%s)", chunkCond, extraFilter), args + chunkCond = fmt.Sprintf("(%s) AND (%s)", chunkCond, extraFilter) } return chunkCond, args } @@ -509,7 +509,7 @@ func MySQLPrimaryKeyQuery() string { } // MySQLTableRowStatsQuery returns the query to fetch the estimated row count and average row size of a table in MySQL -func MySQLTableRowStatsQuery() string { +func MySQLTableStatsQuery() string { return ` SELECT TABLE_ROWS, CEIL(data_length / NULLIF(table_rows, 0)) AS avg_row_bytes, @@ -520,7 +520,7 @@ func MySQLTableRowStatsQuery() string { AND TABLE_NAME = ? ` } - +// MySQLColumnTypeQuery returns a query that fetches the DATA_TYPE and CHARACTER_MAXIMUM_LENGTH of a column in MySQL. func MySQLColumnTypeQuery() string { return ` SELECT DATA_TYPE ,CHARACTER_MAXIMUM_LENGTH @@ -554,6 +554,8 @@ func MySQLDistinctValuesWithCollationQuery(values []string, tableCollationType s return query, args } +// MySQLCountGeneratedInRange builds a query that counts how many values from the provided slice +// fall within [minVal, maxVal] using the table's collation ordering. func MySQLCountGeneratedInRange(values []string, tableCollationType string, minVal, maxVal string) (string, []any) { if len(values) == 0 { return "", nil From debd4eb796b306e5dcca457ddd35f75e8477838c Mon Sep 17 00:00:00 2001 From: saksham-datazip Date: Wed, 4 Mar 2026 15:10:49 +0530 Subject: [PATCH 19/23] chore: resolved lint error --- constants/constants.go | 6 +++--- pkg/jdbc/jdbc.go | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/constants/constants.go b/constants/constants.go index 8eba31a4d..c90429970 100644 --- a/constants/constants.go +++ b/constants/constants.go @@ -35,10 +35,10 @@ const ( DefaultStateTimestampFormat = "2006-01-02T15:04:05.000000000Z" // DistributionLower and DistributionUpper define the acceptable range // of the distribution factor for validating evenly distributed numeric PKs. - DistributionLower = 0.05 - DistributionUpper = 100.0 + DistributionLower = 0.05 + DistributionUpper = 100.0 // UnicodeSize is the total number of valid Unicode code points (0 to 0x10FFFF) - UnicodeSize = 1114112 + UnicodeSize = 1114112 // MysqlChunkAcceptanceRatio defines the minimum ratio of expected chunks that must be generated // for the split to be considered valid. MysqlChunkAcceptanceRatio = float64(0.8) diff --git a/pkg/jdbc/jdbc.go b/pkg/jdbc/jdbc.go index bd0318a52..122fd99c8 100644 --- a/pkg/jdbc/jdbc.go +++ b/pkg/jdbc/jdbc.go @@ -520,6 +520,7 @@ func MySQLTableStatsQuery() string { AND TABLE_NAME = ? ` } + // MySQLColumnTypeQuery returns a query that fetches the DATA_TYPE and CHARACTER_MAXIMUM_LENGTH of a column in MySQL. func MySQLColumnTypeQuery() string { return ` @@ -554,7 +555,7 @@ func MySQLDistinctValuesWithCollationQuery(values []string, tableCollationType s return query, args } -// MySQLCountGeneratedInRange builds a query that counts how many values from the provided slice +// MySQLCountGeneratedInRange builds a query that counts how many values from the provided slice // fall within [minVal, maxVal] using the table's collation ordering. func MySQLCountGeneratedInRange(values []string, tableCollationType string, minVal, maxVal string) (string, []any) { if len(values) == 0 { From 8ead67ea5c605b7455a4083545ea087fe1e75eb4 Mon Sep 17 00:00:00 2001 From: saksham-datazip Date: Mon, 9 Mar 2026 12:04:32 +0530 Subject: [PATCH 20/23] chore: float and uint8 issue resolved --- drivers/mysql/internal/backfill.go | 177 +++++++++++++++-------------- pkg/jdbc/jdbc.go | 31 ++--- types/set.go | 6 + utils/typeutils/reformat.go | 12 ++ 4 files changed, 121 insertions(+), 105 deletions(-) diff --git a/drivers/mysql/internal/backfill.go b/drivers/mysql/internal/backfill.go index 87f3fb7a0..e00bc9f18 100644 --- a/drivers/mysql/internal/backfill.go +++ b/drivers/mysql/internal/backfill.go @@ -63,17 +63,18 @@ func (m *MySQL) ChunkIterator(ctx context.Context, stream types.StreamInterface, }) } +// TODO: Separate chunking-related logic from this function so the individual components can be unit tested independently. func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPool, stream types.StreamInterface) (*types.Set[types.Chunk], error) { var ( approxRowCount int64 avgRowSize any - avgSchemaSize int64 - tableCollationType string + approxTableSize int64 + columnCollationType string dataMaxLength sql.NullInt64 ) tableStatsQuery := jdbc.MySQLTableStatsQuery() - err := m.client.QueryRowContext(ctx, tableStatsQuery, stream.Name()).Scan(&approxRowCount, &avgRowSize, &avgSchemaSize, &tableCollationType) + err := m.client.QueryRowContext(ctx, tableStatsQuery, stream.Name()).Scan(&approxRowCount, &avgRowSize, &approxTableSize) if err != nil { return nil, fmt.Errorf("failed to fetch TableStats query for table=%s: %s", stream.Name(), err) } @@ -106,12 +107,11 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo chunkColumn := stream.Self().StreamMetadata.ChunkColumn var ( - isNumericAndEvenDistributed bool - chunkStepSize int64 - minVal any //to define lower range of the chunk - maxVal any //to define upper range of the chunk - minFloat float64 - maxFloat float64 + chunkStepSize int64 + minVal any // to define lower range of the chunk + maxVal any // to define upper range of the chunk + minFloat float64 + maxFloat float64 ) pkColumns := stream.GetStream().SourceDefinedPrimaryKey.Array() @@ -126,37 +126,30 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo return nil, fmt.Errorf("Stream %s: Failed to get table extremes: %s", stream.ID(), err) } } - //defining boolean to check if string is supported or not + stringSupportedPk := false if len(pkColumns) == 1 { // 1. Try Numeric Strategy - isNumericAndEvenDistributed, chunkStepSize, minFloat, maxFloat, err = IsNumericAndEvenDistributed(minVal, maxVal, approxRowCount, chunkSize) - if err != nil { - logger.Infof("Stream %s: PK is not numeric or conversion failed, falling back to string splitting: %s", stream.ID(), err) - } - + chunkStepSize, minFloat, maxFloat = IsNumericAndEvenDistributed(minVal, maxVal, approxRowCount, chunkSize) // 2. If not numeric, check for supported String strategy - if !isNumericAndEvenDistributed { + if chunkStepSize == 0 { var dataType string - // Fetch column type - query := jdbc.MySQLColumnTypeQuery() - err = m.client.QueryRowContext(ctx, query, stream.Name(), pkColumns[0]).Scan(&dataType, &dataMaxLength) + query := jdbc.MySQLColumnStatsQuery() + err = m.client.QueryRowContext(ctx, query, stream.Name(), pkColumns[0]).Scan(&dataType, &dataMaxLength, &columnCollationType) if err != nil { return nil, fmt.Errorf("failed to fetch Column DataType and max length %s", err) } - switch strings.ToLower(dataType){ - case "char", "varchar": - stringSupportedPk = true - logger.Infof("%s is a string type PK",pkColumns[0]) - if dataMaxLength.Valid { - logger.Infof("Data Max Length: %d", dataMaxLength.Int64) - } - - default: - stringSupportedPk = false - logger.Infof("%s is not a string type PK",pkColumns[0]) + switch strings.ToLower(dataType) { + case "char", "varchar": + stringSupportedPk = true + logger.Infof("%s is a string type PK", pkColumns[0]) + if dataMaxLength.Valid { + logger.Infof("Data Max Length: %d", dataMaxLength.Int64) + } + default: + logger.Infof("%s is not a string type PK", pkColumns[0]) } } } @@ -254,13 +247,20 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo Chunks formed: (-∞, 0), [0,25), [25,50), [50,75), [75,100), [100, +∞) */ - splitEvenlyForInt := func(chunks *types.Set[types.Chunk], chunkStepSize float64) { + splitEvenlyForInt := func(chunks *types.Set[types.Chunk], chunkStepSize float64) error { chunks.Insert(types.Chunk{ Min: nil, Max: utils.ConvertToString(minFloat), }) prev := minFloat for next := minFloat + chunkStepSize; next <= maxFloat; next += chunkStepSize { + // Detect float precision collapse to protect infinite loop + if next <= prev { + logger.Warnf("float precision collapse detected, falling back to SplitViaPrimaryKey for stream %s", stream.ID()) + chunks.Clear() + err := splitViaPrimaryKey(stream, chunks) + return err + } chunks.Insert(types.Chunk{ Min: utils.ConvertToString(prev), Max: utils.ConvertToString(next), @@ -272,6 +272,7 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo Max: nil, }) logger.Infof("Chunking completed using splitEvenlyForInt Method for stream %s", stream.ID()) + return nil } /* @@ -281,7 +282,7 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo Workflow: 1. Convert min and max string values into padded form and map them into big.Int using unicode-based encoding. - 2. Estimate the expected number of chunks based on schema size and target file size. + 2. Estimate the expected number of chunks based on table size and target file size. 3. Compute an initial chunk interval using ceil division on the numeric range. 4. Iteratively (up to 5 attempts): - Adjust the interval using an AP-based variation. @@ -305,64 +306,64 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo (-∞, "aa"), ["aa","ai"), ["ai","ar"), ["ar","az"), ["az", +∞) */ splitEvenlyForString := func(chunks *types.Set[types.Chunk]) error { - var maxValBaseN, minValBaseN big.Int var validChunksCount int maxValPadded := utils.ConvertToString(maxVal) minValPadded := utils.ConvertToString(minVal) if dataMaxLength.Valid { - maxValPadded = padRightNull(maxValPadded, int(dataMaxLength.Int64)) - minValPadded = padRightNull(minValPadded, int(dataMaxLength.Int64)) + maxValPadded = padRightWithNulls(maxValPadded, int(dataMaxLength.Int64)) + minValPadded = padRightWithNulls(minValPadded, int(dataMaxLength.Int64)) } - val1 := convertUnicodeStringToInt(maxValPadded) - maxValBaseN.Set(&val1) - val2 := convertUnicodeStringToInt(minValPadded) - minValBaseN.Set(&val2) + maxEncodedBigIntValue := encodeUnicodeStringToBigInt(maxValPadded) + minEncodedBigIntValue := encodeUnicodeStringToBigInt(minValPadded) - expectedChunks := int64(math.Ceil(float64(avgSchemaSize) / float64(constants.EffectiveParquetSize))) - if expectedChunks <= 0 { - expectedChunks = 1 - } + expectedChunks := int64(math.Ceil(float64(approxTableSize) / float64(constants.EffectiveParquetSize))) + expectedChunks = utils.Ternary(expectedChunks <= 0, int64(1), expectedChunks).(int64) - chunkdiff := new(big.Int).Sub(&maxValBaseN, &minValBaseN) - chunkdiff.Add(chunkdiff, new(big.Int).Sub(big.NewInt(expectedChunks), big.NewInt(1))) - chunkdiff.Div(chunkdiff, big.NewInt(expectedChunks)) //ceil division set up + chunkDiff := new(big.Int).Sub(&maxEncodedBigIntValue, &minEncodedBigIntValue) + chunkDiff.Add(chunkDiff, new(big.Int).Sub(big.NewInt(expectedChunks), big.NewInt(1))) + chunkDiff.Div(chunkDiff, big.NewInt(expectedChunks)) //ceil division set up rangeSlice := []string{} // Try up to 5 times to generate balanced chunks by slightly adjusting the chunk size each iteration. for retryAttempt := int64(0); retryAttempt < int64(5); retryAttempt++ { - temporarychunkdiff := new(big.Int).Set(chunkdiff) - temporarychunkdiff.Add(temporarychunkdiff, big.NewInt(retryAttempt)) - temporarychunkdiff.Div(temporarychunkdiff, big.NewInt(retryAttempt+1)) - curr := new(big.Int).Set(&minValBaseN) - - for chunkIdx := int64(0); chunkIdx < expectedChunks && curr.Cmp(&maxValBaseN) < 0; chunkIdx++ { - rangeSlice = append(rangeSlice, convertIntUnicodeToString(curr)) - curr.Add(curr, temporarychunkdiff) + temporaryChunkDiff := new(big.Int).Set(chunkDiff) + temporaryChunkDiff.Add(temporaryChunkDiff, big.NewInt(retryAttempt)) + temporaryChunkDiff.Div(temporaryChunkDiff, big.NewInt(retryAttempt+1)) + currentBoundary := new(big.Int).Set(&minEncodedBigIntValue) + + for chunkIdx := int64(0); chunkIdx < expectedChunks*(retryAttempt+1) && currentBoundary.Cmp(&maxEncodedBigIntValue) < 0; chunkIdx++ { + rangeSlice = append(rangeSlice, decodeBigIntToUnicodeString(currentBoundary)) + currentBoundary.Add(currentBoundary, temporaryChunkDiff) } // Align boundaries with actual DB values using MySQL collation ordering - rangeSlice = append(rangeSlice, convertIntUnicodeToString(&maxValBaseN)) - query, args := jdbc.MySQLDistinctValuesWithCollationQuery(rangeSlice, tableCollationType) + rangeSlice = append(rangeSlice, decodeBigIntToUnicodeString(&maxEncodedBigIntValue)) + query, args := jdbc.MySQLDistinctValuesWithCollationQuery(rangeSlice, columnCollationType) rows, err := m.client.QueryContext(ctx, query, args...) if err != nil { return fmt.Errorf("failed to run distinct query: %s", err) } rangeSlice = rangeSlice[:0] - + // Some chunks generated might be completely empty when boundaries greater + // than the max value and smaller than the min value exists for rows.Next() { var val string if err := rows.Scan(&val); err != nil { + rows.Close() return fmt.Errorf("failed to scan row: %s", err) } rangeSlice = append(rangeSlice, val) } - rows.Close() - //counting the number of valid chunks generated i.e., between min and max - query, args = jdbc.MySQLCountGeneratedInRange(rangeSlice, tableCollationType, minValPadded, maxValPadded) + if err := rows.Err(); err != nil { + return fmt.Errorf("row iteration error during distinct boundaries iteration: %s", err) + } + + // Counting the number of valid chunks generated i.e., between min and max + query, args = jdbc.MySQLCountGeneratedInRange(rangeSlice, columnCollationType, minValPadded, maxValPadded) err = m.client.QueryRowContext(ctx, query, args...).Scan(&validChunksCount) if err != nil { return fmt.Errorf("failed to run count query: %s", err) @@ -374,7 +375,7 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo break } - //if the number of valid chunks generated is less than the expected chunks * a constant factor even after 5 iterations, we fallback to splitViaPrimaryKey + // If the number of valid chunks generated is less than the expected chunks * a constant factor even after 5 iterations, we fallback to splitViaPrimaryKey if float64(validChunksCount) < float64(expectedChunks)*constants.MysqlChunkAcceptanceRatio && retryAttempt == 4 { logger.Warnf("failed to generate chunks for stream %s, falling back to splitviaprimarykey method", stream.ID()) err = splitViaPrimaryKey(stream, chunks) @@ -418,18 +419,15 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo } switch { - case len(pkColumns) == 1 && isNumericAndEvenDistributed: + case len(pkColumns) == 1 && chunkStepSize > 0: logger.Infof("Using splitEvenlyForInt Method for stream %s", stream.ID()) - splitEvenlyForInt(chunks, float64(chunkStepSize)) - + err = splitEvenlyForInt(chunks, float64(chunkStepSize)) case len(pkColumns) == 1 && stringSupportedPk: logger.Infof("Using splitEvenlyForString Method for stream %s", stream.ID()) err = splitEvenlyForString(chunks) - - case len(pkColumns) > 1: + case len(pkColumns) > 0: logger.Infof("Using SplitViaPrimaryKey Method for stream %s", stream.ID()) err = splitViaPrimaryKey(stream, chunks) - default: logger.Infof("Falling back to limit offset method for stream %s", stream.ID()) err = limitOffsetChunking(chunks) @@ -445,33 +443,36 @@ func (m *MySQL) getTableExtremes(ctx context.Context, stream types.StreamInterfa } // checks if the pk column is numeric and evenly distributed -func IsNumericAndEvenDistributed(minVal any, maxVal any, approxRowCount int64, chunkSize int64) (bool, int64, float64, float64, error) { - if approxRowCount == 0 { - return false, 0, 0, 0, nil +func IsNumericAndEvenDistributed(minVal any, maxVal any, approxRowCount int64, chunkSize int64) (int64, float64, float64) { + minFloat, err := typeutils.ReformatFloat64(minVal) + if err != nil { + logger.Debugf("failed to parse minVal: %s", err) + return 0, 0, 0 } - minFloat, err1 := typeutils.ReformatFloat64(minVal) - if err1 != nil { - return false, 0, 0, 0, fmt.Errorf("failed to parse minVal: %s", err1) + maxFloat, err := typeutils.ReformatFloat64(maxVal) + if err != nil { + logger.Debugf("failed to parse maxVal: %s", err) + return 0, 0, 0 } - - maxFloat, err2 := typeutils.ReformatFloat64(maxVal) - if err2 != nil { - return false, 0, 0, 0, fmt.Errorf("failed to parse maxVal: %s", err2) + if maxFloat > float64(math.MaxInt64) || minFloat < float64(math.MinInt64) { + logger.Debugf("Numeric range exceeds signed int64 limits, forcing PK fallback") + return 0, 0, 0 } distributionFactor := (maxFloat - minFloat + 1) / float64(approxRowCount) if distributionFactor < constants.DistributionLower || distributionFactor > constants.DistributionUpper { - return false, 0, 0, 0, fmt.Errorf("distribution factor is not in the range of %f to %f", constants.DistributionLower, constants.DistributionUpper) + logger.Debugf("distribution factor is not in the range of %f to %f", constants.DistributionLower, constants.DistributionUpper) + return 0, 0, 0 } chunkStepSize := int64(math.Max(distributionFactor*float64(chunkSize), 1)) - return true, chunkStepSize, minFloat, maxFloat, nil + return chunkStepSize, minFloat, maxFloat } /* - convertUnicodeStringToInt maps a string to a big.Int using base = 1114112(UnicodeSize), treating each rune as a digit in a positional system. + encodeUnicodeStringToBigInt maps a string to a big.Int using base = 1114112(UnicodeSize), treating each rune as a digit in a positional system. Value = r₀*base^(n-1) + r₁*base^(n-2) + ... + rₙ @@ -480,11 +481,11 @@ func IsNumericAndEvenDistributed(minVal any, maxVal any, approxRowCount int64, c r₀ = 'a' = 97, r₁ = 'a' = 97, base = 1114112 Value = r₀*base^(n-1) + r₁*base^(n-2) + = 97*1114112 + 97 = 108068961 - */ -func convertUnicodeStringToInt(s string) big.Int { +func encodeUnicodeStringToBigInt(s string) big.Int { base := big.NewInt(constants.UnicodeSize) val := big.NewInt(0) @@ -496,7 +497,7 @@ func convertUnicodeStringToInt(s string) big.Int { } /* - convertIntUnicodeToString reconstructs the original string from its big.Int representation by extracting digits in base = 1114112 (UnicodeSize). + decodeBigIntToUnicodeString reconstructs the original string from its big.Int representation by extracting digits in base = 1114112 (UnicodeSize). It repeatedly takes modulus and division by base to recover each rune: rᵢ = n % base, then n = n / base @@ -515,7 +516,7 @@ func convertUnicodeStringToInt(s string) big.Int { Reconstructed (after reversing): "aa" */ -func convertIntUnicodeToString(n *big.Int) string { +func decodeBigIntToUnicodeString(n *big.Int) string { if n.Cmp(big.NewInt(0)) == 0 { return "" } @@ -533,7 +534,13 @@ func convertIntUnicodeToString(n *big.Int) string { return string(runes) } -func padRightNull(s string, maxLength int) string { +/* + Padding a string with null characters to a specified length. + + Example: + padRightWithNulls("aa", 4) = "aa\x00\x00" +*/ +func padRightWithNulls(s string, maxLength int) string { length := utf8.RuneCountInString(s) if length >= maxLength { return s diff --git a/pkg/jdbc/jdbc.go b/pkg/jdbc/jdbc.go index 7d2bf8183..359716dda 100644 --- a/pkg/jdbc/jdbc.go +++ b/pkg/jdbc/jdbc.go @@ -255,7 +255,6 @@ func buildChunkConditionMySQL(filterColumns []string, chunk types.Chunk, extraFi // For upper bounds, it creates: // (c1 < v1) OR (c1 = v1 AND c2 < v2) OR (c1 = v1 AND c2 = v2 AND c3 < v3) buildBound := func(values []string, isLower bool) (string, []any) { - //note: values can never be empty var args []any orGroups := make([]string, 0, len(quotedCols)) @@ -508,23 +507,22 @@ func MySQLPrimaryKeyQuery() string { ` } -// MySQLTableRowStatsQuery returns the query to fetch the estimated row count and average row size of a table in MySQL +// MySQLTableStatsQuery returns the query to fetch the estimated row count and average row size of a table in MySQL func MySQLTableStatsQuery() string { return ` SELECT TABLE_ROWS, CEIL(data_length / NULLIF(table_rows, 0)) AS avg_row_bytes, - DATA_LENGTH, - TABLE_COLLATION + DATA_LENGTH FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA = DATABASE() AND TABLE_NAME = ? ` } -// MySQLColumnTypeQuery returns a query that fetches the DATA_TYPE and CHARACTER_MAXIMUM_LENGTH of a column in MySQL. -func MySQLColumnTypeQuery() string { +// MySQLColumnStatsQuery returns a query that fetches the DATA_TYPE, CHARACTER_MAXIMUM_LENGTH and Collation type of a column in MySQL. +func MySQLColumnStatsQuery() string { return ` - SELECT DATA_TYPE ,CHARACTER_MAXIMUM_LENGTH + SELECT DATA_TYPE, CHARACTER_MAXIMUM_LENGTH, COALESCE(COLLATION_NAME, '') FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = DATABASE() AND TABLE_NAME = ? @@ -534,11 +532,8 @@ func MySQLColumnTypeQuery() string { } // MySQLDistinctValuesWithCollationQuery builds a DISTINCT query over a slice of strings -// using the table's collation type. -func MySQLDistinctValuesWithCollationQuery(values []string, tableCollationType string) (string, []any) { - if len(values) == 0 { - return "", nil - } +// using the column's collation type. +func MySQLDistinctValuesWithCollationQuery(values []string, columnCollationType string) (string, []any) { unionParts := make([]string, 0, len(values)) args := make([]any, 0, len(values)) for _, v := range values { @@ -551,17 +546,13 @@ func MySQLDistinctValuesWithCollationQuery(values []string, tableCollationType s %s ) AS t ORDER BY val COLLATE %s; - `, tableCollationType, strings.Join(unionParts, "\nUNION ALL\n"), tableCollationType) + `, columnCollationType, strings.Join(unionParts, " UNION ALL "), columnCollationType) return query, args } // MySQLCountGeneratedInRange builds a query that counts how many values from the provided slice -// fall within [minVal, maxVal] using the table's collation ordering. -func MySQLCountGeneratedInRange(values []string, tableCollationType string, minVal, maxVal string) (string, []any) { - if len(values) == 0 { - return "", nil - } - +// fall within [minVal, maxVal] using the column's collation ordering. +func MySQLCountGeneratedInRange(values []string, columnCollationType string, minVal, maxVal string) (string, []any) { unionParts := make([]string, 0, len(values)) args := make([]any, 0, len(values)+2) @@ -584,7 +575,7 @@ func MySQLCountGeneratedInRange(values []string, tableCollationType string, minV FROM ( %s ) AS t; - `, tableCollationType, tableCollationType, tableCollationType, tableCollationType, strings.Join(unionParts, "\nUNION ALL\n")) + `, columnCollationType, columnCollationType, columnCollationType, columnCollationType, strings.Join(unionParts, " UNION ALL ")) return query, args } diff --git a/types/set.go b/types/set.go index 9ae2eec13..0d0cb8688 100644 --- a/types/set.go +++ b/types/set.go @@ -208,3 +208,9 @@ func (st *Set[T]) UnmarshalJSON(data []byte) error { func (st *Set[T]) MarshalJSON() ([]byte, error) { return json.Marshal(st.Array()) } + +// Clear removes all elements from the set +func (st *Set[T]) Clear() { + st.hash = make(map[string]nothing) + st.storage = make(map[string]T) +} diff --git a/utils/typeutils/reformat.go b/utils/typeutils/reformat.go index b5bfdb474..581490505 100644 --- a/utils/typeutils/reformat.go +++ b/utils/typeutils/reformat.go @@ -311,6 +311,18 @@ func ReformatInt64(v any) (int64, error) { case uint64: //nolint:gosec // G115: converting uint64 to int64 is safe for expected ranges return int64(v), nil + case []uint8: + strVal := string(v) + intValue, err := strconv.ParseInt(strVal, 10, 64) + if err == nil { + return intValue, nil + } + uintValue, err := strconv.ParseUint(strVal, 10, 64) + if err != nil { + return int64(0), fmt.Errorf("failed to change []byte %v to int64: %v", v, err) + } + //nolint:gosec // G115: converting []uint8 to int64 is safe and required for backward compatibility + return int64(uintValue), nil case bool: if v { return 1, nil From 0caf2aa753bc29f2ddb51fd92be2066d91855b09 Mon Sep 17 00:00:00 2001 From: saksham-datazip Date: Mon, 9 Mar 2026 16:30:02 +0530 Subject: [PATCH 21/23] chore: converted float64 to int64 --- drivers/mysql/internal/backfill.go | 87 +++++++++++++++++++++--------- 1 file changed, 63 insertions(+), 24 deletions(-) diff --git a/drivers/mysql/internal/backfill.go b/drivers/mysql/internal/backfill.go index e00bc9f18..ddd26b0f2 100644 --- a/drivers/mysql/internal/backfill.go +++ b/drivers/mysql/internal/backfill.go @@ -3,11 +3,13 @@ package driver import ( "context" "database/sql" + "encoding/json" "fmt" "math" "math/big" "slices" "sort" + "strconv" "strings" "unicode/utf8" @@ -110,8 +112,8 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo chunkStepSize int64 minVal any // to define lower range of the chunk maxVal any // to define upper range of the chunk - minFloat float64 - maxFloat float64 + minInt64 int64 + maxInt64 int64 ) pkColumns := stream.GetStream().SourceDefinedPrimaryKey.Array() @@ -131,7 +133,7 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo if len(pkColumns) == 1 { // 1. Try Numeric Strategy - chunkStepSize, minFloat, maxFloat = IsNumericAndEvenDistributed(minVal, maxVal, approxRowCount, chunkSize) + chunkStepSize, minInt64, maxInt64 = IsNumericAndEvenDistributed(minVal, maxVal, approxRowCount, chunkSize) // 2. If not numeric, check for supported String strategy if chunkStepSize == 0 { var dataType string @@ -233,13 +235,13 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo } /* - splitEvenlyForInt generates chunk boundaries for numeric values by dividing the range [minFloat, maxFloat] using an arithmetic progression (AP). + splitEvenlyForInt generates chunk boundaries for numeric values by dividing the range [minInt64, maxInt64] using an arithmetic progression (AP). Each boundary follows: next = prev + chunkStepSize Example: - minFloat = 0, maxFloat = 100, chunkStepSize = 25 + minInt64 = 0, maxInt64 = 100, chunkStepSize = 25 AP sequence: 0 → 25 → 50 → 75 → 100 @@ -247,19 +249,18 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo Chunks formed: (-∞, 0), [0,25), [25,50), [50,75), [75,100), [100, +∞) */ - splitEvenlyForInt := func(chunks *types.Set[types.Chunk], chunkStepSize float64) error { + splitEvenlyForInt := func(chunks *types.Set[types.Chunk], chunkStepSize int64) error { chunks.Insert(types.Chunk{ Min: nil, - Max: utils.ConvertToString(minFloat), + Max: utils.ConvertToString(minInt64), }) - prev := minFloat - for next := minFloat + chunkStepSize; next <= maxFloat; next += chunkStepSize { - // Detect float precision collapse to protect infinite loop + prev := minInt64 + for next := minInt64 + chunkStepSize; next <= maxInt64; next += chunkStepSize { + // condition to protect from infinite loop if next <= prev { - logger.Warnf("float precision collapse detected, falling back to SplitViaPrimaryKey for stream %s", stream.ID()) + logger.Warnf("int precision collapse detected, falling back to SplitViaPrimaryKey for stream %s", stream.ID()) chunks.Clear() - err := splitViaPrimaryKey(stream, chunks) - return err + return splitViaPrimaryKey(stream, chunks) } chunks.Insert(types.Chunk{ Min: utils.ConvertToString(prev), @@ -421,7 +422,7 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo switch { case len(pkColumns) == 1 && chunkStepSize > 0: logger.Infof("Using splitEvenlyForInt Method for stream %s", stream.ID()) - err = splitEvenlyForInt(chunks, float64(chunkStepSize)) + err = splitEvenlyForInt(chunks, chunkStepSize) case len(pkColumns) == 1 && stringSupportedPk: logger.Infof("Using splitEvenlyForString Method for stream %s", stream.ID()) err = splitEvenlyForString(chunks) @@ -443,32 +444,33 @@ func (m *MySQL) getTableExtremes(ctx context.Context, stream types.StreamInterfa } // checks if the pk column is numeric and evenly distributed -func IsNumericAndEvenDistributed(minVal any, maxVal any, approxRowCount int64, chunkSize int64) (int64, float64, float64) { - minFloat, err := typeutils.ReformatFloat64(minVal) +func IsNumericAndEvenDistributed(minVal any, maxVal any, approxRowCount int64, chunkSize int64) (int64, int64, int64) { + if exceedsInt64Limits(minVal) || exceedsInt64Limits(maxVal) { + logger.Debugf("minVal or maxVal exceeds int64 limits") + return 0, 0, 0 + } + + minInt64, err := typeutils.ReformatInt64(minVal) if err != nil { logger.Debugf("failed to parse minVal: %s", err) return 0, 0, 0 } - maxFloat, err := typeutils.ReformatFloat64(maxVal) + maxInt64, err := typeutils.ReformatInt64(maxVal) if err != nil { logger.Debugf("failed to parse maxVal: %s", err) return 0, 0, 0 } - if maxFloat > float64(math.MaxInt64) || minFloat < float64(math.MinInt64) { - logger.Debugf("Numeric range exceeds signed int64 limits, forcing PK fallback") - return 0, 0, 0 - } - distributionFactor := (maxFloat - minFloat + 1) / float64(approxRowCount) + distributionFactor := float64(maxInt64 - minInt64 + 1) / float64(approxRowCount) if distributionFactor < constants.DistributionLower || distributionFactor > constants.DistributionUpper { logger.Debugf("distribution factor is not in the range of %f to %f", constants.DistributionLower, constants.DistributionUpper) return 0, 0, 0 } - chunkStepSize := int64(math.Max(distributionFactor*float64(chunkSize), 1)) - return chunkStepSize, minFloat, maxFloat + chunkStepSize := int64(math.Ceil(math.Max(distributionFactor*float64(chunkSize), 1))) + return chunkStepSize, minInt64, maxInt64 } /* @@ -547,3 +549,40 @@ func padRightWithNulls(s string, maxLength int) string { } return s + strings.Repeat("\x00", maxLength-length) } + +// checks if a value exceeds int64 limits +func exceedsInt64Limits(val any) bool { + switch v := val.(type) { + case json.Number: + if _, err := v.Int64(); err == nil { + return false + } + if floatVal, err := v.Float64(); err == nil { + return floatVal > float64(math.MaxInt64) || floatVal < float64(math.MinInt64) + } + return true + case float64: + return v > float64(math.MaxInt64) || v < float64(math.MinInt64) + case uint64: + return v > math.MaxInt64 + case uint: + return uint64(v) > math.MaxInt64 + case []uint8: + return exceedsInt64Limits(string(v)) + case string: + if _, err := strconv.ParseInt(v, 10, 64); err == nil { + return false + } + if u, err := strconv.ParseUint(v, 10, 64); err == nil { + return u > math.MaxInt64 + } + return true + case *any: + if v != nil { + return exceedsInt64Limits(*v) + } + return false + default: + return false + } +} From 8ccfdd6bf84abf0cf1d313a4f8a2f38c4e5bccc9 Mon Sep 17 00:00:00 2001 From: saksham-datazip Date: Wed, 11 Mar 2026 15:45:48 +0530 Subject: [PATCH 22/23] chore: added uint8[] block and took datatype for numeric value from iceberg --- constants/state_version.go | 6 +- drivers/mysql/internal/backfill.go | 98 +++++++++--------------------- utils/typeutils/reformat.go | 25 ++++---- 3 files changed, 48 insertions(+), 81 deletions(-) diff --git a/constants/state_version.go b/constants/state_version.go index 2fbf0d6b3..24afc218d 100644 --- a/constants/state_version.go +++ b/constants/state_version.go @@ -28,9 +28,13 @@ package constants // // - Version 4: (Current Version) Unsigned int/integer/bigint map to Int64. // * Earlier unsigned int/integer/bigint were mapped to Int32 which caused integer overflows. +// +// - Version 5: (Current Version) Added []uint8 (byte slice) support in ReformatInt64 +// * Previously, numeric values returned as byte slices (common in some SQL drivers) caused errors +// * Now these byte slices are parsed and converted into int64 const ( - LatestStateVersion = 4 + LatestStateVersion = 5 ) // Used as the current version of the state when the program is running diff --git a/drivers/mysql/internal/backfill.go b/drivers/mysql/internal/backfill.go index ddd26b0f2..38032f64e 100644 --- a/drivers/mysql/internal/backfill.go +++ b/drivers/mysql/internal/backfill.go @@ -3,13 +3,11 @@ package driver import ( "context" "database/sql" - "encoding/json" "fmt" "math" "math/big" "slices" "sort" - "strconv" "strings" "unicode/utf8" @@ -112,8 +110,8 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo chunkStepSize int64 minVal any // to define lower range of the chunk maxVal any // to define upper range of the chunk - minInt64 int64 - maxInt64 int64 + minBoundary int64 + maxBoundary int64 ) pkColumns := stream.GetStream().SourceDefinedPrimaryKey.Array() @@ -132,17 +130,17 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo stringSupportedPk := false if len(pkColumns) == 1 { + var dataType string + query := jdbc.MySQLColumnStatsQuery() + err = m.client.QueryRowContext(ctx, query, stream.Name(), pkColumns[0]).Scan(&dataType, &dataMaxLength, &columnCollationType) + if err != nil { + return nil, fmt.Errorf("failed to fetch Column DataType and max length %s", err) + } // 1. Try Numeric Strategy - chunkStepSize, minInt64, maxInt64 = IsNumericAndEvenDistributed(minVal, maxVal, approxRowCount, chunkSize) + chunkStepSize, minBoundary, maxBoundary = IsNumericAndEvenDistributed(minVal, maxVal, approxRowCount, chunkSize, dataType) + // 2. If not numeric, check for supported String strategy if chunkStepSize == 0 { - var dataType string - query := jdbc.MySQLColumnStatsQuery() - err = m.client.QueryRowContext(ctx, query, stream.Name(), pkColumns[0]).Scan(&dataType, &dataMaxLength, &columnCollationType) - if err != nil { - return nil, fmt.Errorf("failed to fetch Column DataType and max length %s", err) - } - switch strings.ToLower(dataType) { case "char", "varchar": stringSupportedPk = true @@ -235,13 +233,13 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo } /* - splitEvenlyForInt generates chunk boundaries for numeric values by dividing the range [minInt64, maxInt64] using an arithmetic progression (AP). + splitEvenlyForInt generates chunk boundaries for numeric values by dividing the range [minBoundary, maxBoundary] using an arithmetic progression (AP). Each boundary follows: next = prev + chunkStepSize Example: - minInt64 = 0, maxInt64 = 100, chunkStepSize = 25 + minBoundary = 0, maxBoundary = 100, chunkStepSize = 25 AP sequence: 0 → 25 → 50 → 75 → 100 @@ -252,10 +250,10 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo splitEvenlyForInt := func(chunks *types.Set[types.Chunk], chunkStepSize int64) error { chunks.Insert(types.Chunk{ Min: nil, - Max: utils.ConvertToString(minInt64), + Max: utils.ConvertToString(minBoundary), }) - prev := minInt64 - for next := minInt64 + chunkStepSize; next <= maxInt64; next += chunkStepSize { + prev := minBoundary + for next := minBoundary + chunkStepSize; next <= maxBoundary; next += chunkStepSize { // condition to protect from infinite loop if next <= prev { logger.Warnf("int precision collapse detected, falling back to SplitViaPrimaryKey for stream %s", stream.ID()) @@ -323,21 +321,21 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo expectedChunks := int64(math.Ceil(float64(approxTableSize) / float64(constants.EffectiveParquetSize))) expectedChunks = utils.Ternary(expectedChunks <= 0, int64(1), expectedChunks).(int64) - chunkDiff := new(big.Int).Sub(&maxEncodedBigIntValue, &minEncodedBigIntValue) - chunkDiff.Add(chunkDiff, new(big.Int).Sub(big.NewInt(expectedChunks), big.NewInt(1))) - chunkDiff.Div(chunkDiff, big.NewInt(expectedChunks)) //ceil division set up + stringChunkStepSize := new(big.Int).Sub(&maxEncodedBigIntValue, &minEncodedBigIntValue) + stringChunkStepSize.Add(stringChunkStepSize, new(big.Int).Sub(big.NewInt(expectedChunks), big.NewInt(1))) + stringChunkStepSize.Div(stringChunkStepSize, big.NewInt(expectedChunks)) //ceil division set up rangeSlice := []string{} // Try up to 5 times to generate balanced chunks by slightly adjusting the chunk size each iteration. for retryAttempt := int64(0); retryAttempt < int64(5); retryAttempt++ { - temporaryChunkDiff := new(big.Int).Set(chunkDiff) - temporaryChunkDiff.Add(temporaryChunkDiff, big.NewInt(retryAttempt)) - temporaryChunkDiff.Div(temporaryChunkDiff, big.NewInt(retryAttempt+1)) + adjustedStepSize := new(big.Int).Set(stringChunkStepSize) + adjustedStepSize.Add(adjustedStepSize, big.NewInt(retryAttempt)) + adjustedStepSize.Div(adjustedStepSize, big.NewInt(retryAttempt+1)) currentBoundary := new(big.Int).Set(&minEncodedBigIntValue) for chunkIdx := int64(0); chunkIdx < expectedChunks*(retryAttempt+1) && currentBoundary.Cmp(&maxEncodedBigIntValue) < 0; chunkIdx++ { rangeSlice = append(rangeSlice, decodeBigIntToUnicodeString(currentBoundary)) - currentBoundary.Add(currentBoundary, temporaryChunkDiff) + currentBoundary.Add(currentBoundary, adjustedStepSize) } // Align boundaries with actual DB values using MySQL collation ordering @@ -444,25 +442,26 @@ func (m *MySQL) getTableExtremes(ctx context.Context, stream types.StreamInterfa } // checks if the pk column is numeric and evenly distributed -func IsNumericAndEvenDistributed(minVal any, maxVal any, approxRowCount int64, chunkSize int64) (int64, int64, int64) { - if exceedsInt64Limits(minVal) || exceedsInt64Limits(maxVal) { - logger.Debugf("minVal or maxVal exceeds int64 limits") +func IsNumericAndEvenDistributed(minVal any, maxVal any, approxRowCount int64, chunkSize int64, dataType string) (int64, int64, int64) { + icebergDataType := mysqlTypeToDataTypes[strings.ToLower(dataType)] + if icebergDataType != types.Int32 && icebergDataType != types.Int64 { + logger.Debugf("Current pk is not a supported numeric column") return 0, 0, 0 } - minInt64, err := typeutils.ReformatInt64(minVal) + minBoundary, err := typeutils.ReformatInt64(minVal) if err != nil { logger.Debugf("failed to parse minVal: %s", err) return 0, 0, 0 } - maxInt64, err := typeutils.ReformatInt64(maxVal) + maxBoundary, err := typeutils.ReformatInt64(maxVal) if err != nil { logger.Debugf("failed to parse maxVal: %s", err) return 0, 0, 0 } - distributionFactor := float64(maxInt64 - minInt64 + 1) / float64(approxRowCount) + distributionFactor := (float64(maxBoundary) - float64(minBoundary) + 1) / float64(approxRowCount) if distributionFactor < constants.DistributionLower || distributionFactor > constants.DistributionUpper { logger.Debugf("distribution factor is not in the range of %f to %f", constants.DistributionLower, constants.DistributionUpper) @@ -470,7 +469,7 @@ func IsNumericAndEvenDistributed(minVal any, maxVal any, approxRowCount int64, c } chunkStepSize := int64(math.Ceil(math.Max(distributionFactor*float64(chunkSize), 1))) - return chunkStepSize, minInt64, maxInt64 + return chunkStepSize, minBoundary, maxBoundary } /* @@ -549,40 +548,3 @@ func padRightWithNulls(s string, maxLength int) string { } return s + strings.Repeat("\x00", maxLength-length) } - -// checks if a value exceeds int64 limits -func exceedsInt64Limits(val any) bool { - switch v := val.(type) { - case json.Number: - if _, err := v.Int64(); err == nil { - return false - } - if floatVal, err := v.Float64(); err == nil { - return floatVal > float64(math.MaxInt64) || floatVal < float64(math.MinInt64) - } - return true - case float64: - return v > float64(math.MaxInt64) || v < float64(math.MinInt64) - case uint64: - return v > math.MaxInt64 - case uint: - return uint64(v) > math.MaxInt64 - case []uint8: - return exceedsInt64Limits(string(v)) - case string: - if _, err := strconv.ParseInt(v, 10, 64); err == nil { - return false - } - if u, err := strconv.ParseUint(v, 10, 64); err == nil { - return u > math.MaxInt64 - } - return true - case *any: - if v != nil { - return exceedsInt64Limits(*v) - } - return false - default: - return false - } -} diff --git a/utils/typeutils/reformat.go b/utils/typeutils/reformat.go index 581490505..22ad78cd5 100644 --- a/utils/typeutils/reformat.go +++ b/utils/typeutils/reformat.go @@ -311,18 +311,6 @@ func ReformatInt64(v any) (int64, error) { case uint64: //nolint:gosec // G115: converting uint64 to int64 is safe for expected ranges return int64(v), nil - case []uint8: - strVal := string(v) - intValue, err := strconv.ParseInt(strVal, 10, 64) - if err == nil { - return intValue, nil - } - uintValue, err := strconv.ParseUint(strVal, 10, 64) - if err != nil { - return int64(0), fmt.Errorf("failed to change []byte %v to int64: %v", v, err) - } - //nolint:gosec // G115: converting []uint8 to int64 is safe and required for backward compatibility - return int64(uintValue), nil case bool: if v { return 1, nil @@ -336,6 +324,19 @@ func ReformatInt64(v any) (int64, error) { return intValue, nil case *any: return ReformatInt64(*v) + case []uint8: + if constants.LoadedStateVersion > 4 { + strVal := string(v) + intValue, err := strconv.ParseInt(strVal, 10, 64) + if err == nil { + return intValue, nil + } + uintValue, err := strconv.ParseUint(strVal, 10, 64) + if err == nil { + //nolint:gosec // G115: converting []uint8 to int64 is safe and required for backward compatibility + return int64(uintValue), nil + } + } } return int64(0), fmt.Errorf("failed to change %v (type:%T) to int64", v, v) From 7754d723da3237b61d8ba994bef8010881e8a916 Mon Sep 17 00:00:00 2001 From: saksham-datazip Date: Wed, 11 Mar 2026 16:01:39 +0530 Subject: [PATCH 23/23] chore: self reviewed --- constants/state_version.go | 2 +- utils/typeutils/reformat.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/constants/state_version.go b/constants/state_version.go index 24afc218d..68b707663 100644 --- a/constants/state_version.go +++ b/constants/state_version.go @@ -26,7 +26,7 @@ package constants // * Earlier if the session timezone or global was set in offset format, it was not parsed correctly and used to fallback to UTC. // * Now it parses the offset correctly and uses the timezone offset to set the timezone for the connection. // -// - Version 4: (Current Version) Unsigned int/integer/bigint map to Int64. +// - Version 4: Unsigned int/integer/bigint map to Int64. // * Earlier unsigned int/integer/bigint were mapped to Int32 which caused integer overflows. // // - Version 5: (Current Version) Added []uint8 (byte slice) support in ReformatInt64 diff --git a/utils/typeutils/reformat.go b/utils/typeutils/reformat.go index 22ad78cd5..f0547d44e 100644 --- a/utils/typeutils/reformat.go +++ b/utils/typeutils/reformat.go @@ -280,7 +280,7 @@ func parseStringTimestamp(value string, isTimestampInDB bool) (time.Time, error) return time.Unix(0, 0).UTC(), nil } -// TODO: Add bytes array handling of int64 and other datatypes. Also add unit test cases for it. +// TODO: Add unit test cases for ReformatInt64 and byte array handling for other datatypes as well. func ReformatInt64(v any) (int64, error) { switch v := v.(type) { case json.Number: