Skip to content
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
83ebf36
feat: mysql chunking optimization
saksham-datazip Jan 27, 2026
f5766f8
mysql optimization comment resolve
saksham-datazip Jan 27, 2026
443cf94
Merge branch 'staging' into feat/mysql-chunking-optimization
saksham-datazip Jan 27, 2026
6fc574c
Merge branch 'staging' into feat/mysql-chunking-optimization
saksham-datazip Feb 2, 2026
c09aee8
chore: formatting fix
saksham-datazip Feb 3, 2026
53520de
my-sql-chunking-formatting-resolved
saksham-datazip Feb 3, 2026
3b9fbe7
mysql-chunking-self-reviewed
saksham-datazip Feb 3, 2026
8e4ba6a
mysql-chunking-optimization-for-string-pk
saksham-datazip Feb 7, 2026
1707ae1
Merge branch 'staging' into feat/mysql-chunking-optimization
saksham-datazip Feb 7, 2026
feca5a0
Merge branch 'staging' into feat/mysql-chunking-optimization
vaibhav-datazip Feb 9, 2026
ccfb371
feat: solved lint issue
saksham-datazip Feb 9, 2026
fe4b4b2
Merge branch 'staging' into feat/mysql-chunking-optimization
saksham-datazip Feb 10, 2026
910246a
feat: mysql chunking optimization review resolved
saksham-datazip Feb 10, 2026
1eacf5a
feat: resolving-lint-extra-spaces
saksham-datazip Feb 10, 2026
964a2ee
feat: lint error resolved
saksham-datazip Feb 10, 2026
11a9f03
feat: self-reviewed
saksham-datazip Feb 10, 2026
348c21a
Merge branch 'staging' into feat/mysql-chunking-optimization
vaibhav-datazip Feb 12, 2026
94a6fd8
feat: recommiting the logical issue in mysql chunking calculation
saksham-datazip Feb 16, 2026
aa03463
Merge branch 'staging' into feat/mysql-chunking-optimization
ImDoubD-datazip Feb 20, 2026
bc1abf8
Merge branch 'staging' into feat/mysql-chunking-optimization
saksham-datazip Feb 23, 2026
6e5e82f
feat: added splitviaprimarykey function
saksham-datazip Feb 24, 2026
d9189b9
chore: Merge branch 'feat/mysql-chunking-optimization' of https://git…
saksham-datazip Feb 24, 2026
69a1714
chore: improved formatting and resolved calculating validationcount q…
saksham-datazip Feb 28, 2026
96b5689
chore: merge conflict resolved
saksham-datazip Mar 1, 2026
66749dd
chore: constant size readjusted
saksham-datazip Mar 1, 2026
fa24a2c
chore: saperated buildChunkConditionMySQL function from mssql
saksham-datazip Mar 2, 2026
8411568
chore: fixed buildChunkConditionMySQL function for multiple colummns
saksham-datazip Mar 3, 2026
64f31c1
chore: resolved comment for final-testing
saksham-datazip Mar 4, 2026
debd4eb
chore: resolved lint error
saksham-datazip Mar 4, 2026
86a2d91
fix: changes pulled from staging
saksham-datazip Mar 7, 2026
8ead67e
chore: float and uint8 issue resolved
saksham-datazip Mar 9, 2026
0caf2aa
chore: converted float64 to int64
saksham-datazip Mar 9, 2026
8ccfdd6
chore: added uint8[] block and took datatype for numeric value from i…
saksham-datazip Mar 11, 2026
7754d72
chore: self reviewed
saksham-datazip Mar 11, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions constants/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ const (
EffectiveParquetSize = int64(256) * 1024 * 1024 * int64(8)
DB2StateTimestampFormat = "2006-01-02 15:04:05.000000"
DefaultStateTimestampFormat = "2006-01-02T15:04:05.000000000Z"
// DistributionLower is the lower bound for distribution factor
DistributionLower = 0.05
// DistributionUpper is the upper bound for distribution factor
DistributionUpper = 100.0
Comment on lines +38 to +39
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

how did we choose these distribution factors ?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it is an assumption and airbyte used it as well

)

type DriverType string
Expand Down
88 changes: 74 additions & 14 deletions drivers/mysql/internal/backfill.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,15 +41,16 @@ func (m *MySQL) ChunkIterator(ctx context.Context, stream types.StreamInterface,
sort.Strings(pkColumns)

logger.Debugf("Starting backfill from %v to %v with filter: %s, args: %v", chunk.Min, chunk.Max, filter, args)
// Get chunks from state or calculate new ones
stmt := ""
// Get chunks from state or calculate new ones
var stmt string
if chunkColumn != "" {
stmt = jdbc.MysqlChunkScanQuery(stream, []string{chunkColumn}, chunk, filter)
} else if len(pkColumns) > 0 {
stmt = jdbc.MysqlChunkScanQuery(stream, pkColumns, chunk, filter)
} else {
stmt = jdbc.MysqlLimitOffsetScanQuery(stream, chunk, filter)
}

logger.Debugf("Executing chunk query: %s", stmt)
setter := jdbc.NewReader(ctx, stmt, func(ctx context.Context, query string, queryArgs ...any) (*sql.Rows, error) {
return tx.QueryContext(ctx, query, args...)
Expand Down Expand Up @@ -93,21 +94,39 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo
chunkSize := int64(math.Ceil(float64(constants.EffectiveParquetSize) / avgRowSizeFloat))
chunks := types.NewSet[types.Chunk]()
chunkColumn := stream.Self().StreamMetadata.ChunkColumn

var (
isEvenDistribution bool
step int64
minVal any // defines the lower bound of the chunk range
maxVal any // defines the upper bound of the chunk range
minFloat float64
maxFloat float64
)

pkColumns := stream.GetStream().SourceDefinedPrimaryKey.Array()
if chunkColumn != "" {
pkColumns = []string{chunkColumn}
}
sort.Strings(pkColumns)

if stream.GetStream().SourceDefinedPrimaryKey.Len() > 0 || chunkColumn != "" {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you can directly check the pkcolumns array here , it has all the info

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

solved

err = jdbc.WithIsolation(ctx, m.client, true, func(tx *sql.Tx) error {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what's the need of isolation here ?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

solved

var err error
minVal, maxVal, err = m.getTableExtremes(ctx, stream, pkColumns, tx)
return err
})
if err != nil {
return nil, fmt.Errorf("failed to get table extremes: %s", err)
}
}
if len(pkColumns) == 1 {
isEvenDistribution, step, minFloat, maxFloat = shouldUseEvenDistribution(minVal, maxVal, approxRowCount, chunkSize)
}
// Takes the user defined batch size as chunkSize
// TODO: common-out the chunking logic for db2, mssql, mysql
splitViaPrimaryKey := func(stream types.StreamInterface, chunks *types.Set[types.Chunk]) error {
Comment on lines +155 to 159
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

leave a line

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

solved

return jdbc.WithIsolation(ctx, m.client, true, func(tx *sql.Tx) error {
// Get primary key column using the provided function
pkColumns := stream.GetStream().SourceDefinedPrimaryKey.Array()
if chunkColumn != "" {
pkColumns = []string{chunkColumn}
}
sort.Strings(pkColumns)
// Get table extremes
minVal, maxVal, err := m.getTableExtremes(ctx, stream, pkColumns, tx)
if err != nil {
return fmt.Errorf("failed to get table extremes: %s", err)
}
if minVal == nil {
return nil
}
Expand Down Expand Up @@ -180,7 +199,31 @@ func (m *MySQL) GetOrSplitChunks(ctx context.Context, pool *destination.WriterPo
})
}

if stream.GetStream().SourceDefinedPrimaryKey.Len() > 0 || chunkColumn != "" {
// Use arithmetic splitting when the key distribution factor falls within the accepted range.
splitEvenlyForInt := func(minf, maxf float64, chunks *types.Set[types.Chunk], step float64) {
if minf+step > maxf {
chunks.Insert(types.Chunk{
Min: nil,
Max: nil,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what does nil to nil chunk mean ? , at other places as well we have created chunk from nil to start and start to nil if there is no further value . similar can be done and if there is no value in table we don't need to proceed till this function

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

solved

})
return
}
prev := minf
for next := minf + step; next <= maxf; next += step {
chunks.Insert(types.Chunk{
Min: utils.ConvertToString(prev),
Max: utils.ConvertToString(next),
})
prev = next
}
chunks.Insert(types.Chunk{
Min: utils.ConvertToString(prev),
Max: nil,
})
}
if len(pkColumns) == 1 && isEvenDistribution {
splitEvenlyForInt(minFloat, maxFloat, chunks, float64(step))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add a space here between the above code block and start of this if block. also rename this variable as minFloat and maxFloat isn't current name for a pk which is just integer

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

solved

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it will be better if you use int as step so that there are well defined boundaries

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this float is because of float pk, their we will need it

} else if len(pkColumns) > 0 {
err = splitViaPrimaryKey(stream, chunks)
} else {
err = limitOffsetChunking(chunks)
Expand All @@ -193,3 +236,20 @@ func (m *MySQL) getTableExtremes(ctx context.Context, stream types.StreamInterfa
err = tx.QueryRowContext(ctx, query).Scan(&min, &max)
return min, max, err
}

func shouldUseEvenDistribution(minVal any, maxVal any, approxRowCount int64, chunkSize int64) (bool, int64, float64, float64) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add a comment for what this function does and what it returns

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

solved

if approxRowCount == 0 {
return false, 0, 0, 0
}
minFloat, err1 := typeutils.ReformatFloat64(minVal)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why err is defined every time why not use one error

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

solved

maxFloat, err2 := typeutils.ReformatFloat64(maxVal)
if err1 != nil || err2 != nil {
return false, 0, 0, 0
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

again why are we not sending the error back ?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

solved

}
distributionFactor := (maxFloat - minFloat + 1) / float64(approxRowCount)
if distributionFactor < constants.DistributionLower || distributionFactor > constants.DistributionUpper {
return false, 0, 0, 0
}
step := int64(math.Max(distributionFactor*float64(chunkSize), 1))
return true, step, minFloat, maxFloat
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what's the need of returning minfloat and max float from here ? isn't It the same value which you had sent to this function

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

solved

}
3 changes: 3 additions & 0 deletions pkg/jdbc/jdbc.go
Original file line number Diff line number Diff line change
Expand Up @@ -429,6 +429,9 @@ func MysqlLimitOffsetScanQuery(stream types.StreamInterface, chunk types.Chunk,
func MysqlChunkScanQuery(stream types.StreamInterface, filterColumns []string, chunk types.Chunk, extraFilter string) string {
condition := buildChunkConditionMySQL(filterColumns, chunk, extraFilter)
quotedTable := QuoteTable(stream.Namespace(), stream.Name(), constants.MySQL)
if condition == "" {
condition = utils.Ternary(extraFilter != "", extraFilter, "1 = 1").(string)
}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this will create not so good query, instead you could have added the WHERE also here and appended it in last string

also can you give example of case where the condition string will be empty. because AFAIK condition creates query based on chunk , and always there will be a chunk like

from start to null

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is used in other drivers as well so i continued using it for sake of consistency

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

solved

return fmt.Sprintf("SELECT * FROM %s WHERE %s", quotedTable, condition)
}

Expand Down
Loading