Commit ce42571

add support for CTID bucketing with snapshotNumPartitionsOverride (#3624)
PeerDB supports parallel snapshotting to optimize initial load time. Today there are two approaches:

1) Compute the total row count of the table and bucket the data evenly by the watermark column. This is the default behavior and gives an even distribution of rows across partitions for the parallel initial load, but computing the total row count can be slow on large tables.

2) With `SnapshotNumPartitionsOverride` enabled, skip the row count: fetch the min/max values of the watermark column and step through that range evenly to get the partition ranges. This cannot guarantee an even distribution of rows across partitions, but it speeds up the initial snapshot on large tables by bypassing the row-count calculation.

This change handles approach 2) when the watermark column is not explicitly passed in and defaults to `ctid`, which was previously a no-op: partition ranges are now formed by dividing the table's heap blocks uniformly (see the sketch after this message). For append-only tables we expect an even distribution of rows, so this should be a pure performance win. For tables that receive updates and deletes, rows may be distributed unevenly across partitions, but that is already the case with approach 2).

Thank you @alon-zeltser-cyera for the contribution.

Separate note: `SnapshotNumPartitionsOverride` was introduced to support the use case where the number of partitions is provided explicitly. There is no reason it has to be tied to the two initial-snapshot bucketing approaches, so we may want to evaluate decoupling the two concepts later on if we want to offer this feature more widely.

TODO:
- [x] Add e2e test
- [x] Run test against a large table

---------

Co-authored-by: Alon Zeltser <[email protected]>
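For illustration, here is a minimal runnable sketch of the block-division arithmetic described above (the values are hypothetical; the actual implementation is `getCTIDBlockPartitions` in the diff below, which additionally handles resume offsets and invalid block counts):

	// Sketch: split a table's heap blocks into numPartitions inclusive ctid ranges.
	// Partition i covers blocks [i*totalBlocks/numPartitions, (i+1)*totalBlocks/numPartitions).
	package main

	import "fmt"

	func main() {
		totalBlocks, numPartitions := int64(10), int64(3) // hypothetical example values
		for i := int64(0); i < numPartitions; i++ {
			startBlock := i * totalBlocks / numPartitions
			endBlock := (i+1)*totalBlocks/numPartitions - 1 // inclusive last block of this bucket
			// A range spans offset 0 of its first block through the maximum tuple
			// offset (65535) of its last block, so consecutive ranges tile the heap.
			fmt.Printf("partition %d: ctid (%d,0) .. (%d,65535)\n", i, startBlock, endBlock)
		}
	}

Running this prints block ranges 0..2, 3..5, and 6..9: the block boundaries are uniform even though the number of live rows per block can vary.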

File tree

flow/connectors/postgres/qrep.go
flow/e2e/clickhouse_test.go

2 files changed: +210 -2

flow/connectors/postgres/qrep.go

Lines changed: 99 additions & 2 deletions

@@ -2,9 +2,12 @@ package connpostgres
 
 import (
 	"bytes"
+	"cmp"
 	"context"
+	"errors"
 	"fmt"
 	"log/slog"
+	"math"
 	"strconv"
 	"strings"
 	"text/template"
@@ -26,7 +29,10 @@ import (
 	"github.com/PeerDB-io/peerdb/flow/shared/exceptions"
 )
 
-const qRepMetadataTableName = "_peerdb_query_replication_metadata"
+const (
+	qRepMetadataTableName = "_peerdb_query_replication_metadata"
+	ctidColumnName        = "ctid"
+)
 
 type QRepPullSink interface {
 	Close(error)
@@ -89,7 +95,7 @@ func (c *PostgresConnector) GetDefaultPartitionKeyForTables(
 
 	if supportsTidScans {
 		for _, tm := range input.TableMappings {
-			output.TableDefaultPartitionKeyMapping[tm.SourceTableIdentifier] = "ctid"
+			output.TableDefaultPartitionKeyMapping[tm.SourceTableIdentifier] = ctidColumnName
 		}
 	}
 
@@ -264,6 +270,18 @@ func (c *PostgresConnector) getNumRowsPartitions(
 
 		return partitionHelper.GetPartitions(), nil
 	} else {
+		// Special handling for CTID watermark column when a fixed number of partitions is specified:
+		// Partitions are created by dividing table blocks uniformly.
+		// Note: partition boundaries (block ranges) are uniform, but actual row distribution may be skewed
+		// due to table bloat, deleted tuples, or uneven data distribution across blocks.
+		if config.WatermarkColumn == ctidColumnName {
+			return c.getCTIDBlockPartitions(ctx, tx, *parsedWatermarkTable, numPartitions, last)
+		}
+
+		// Default path for non-CTID watermark column when a fixed number of partitions is specified:
+		// Partitions are created by uniformly splitting the min/max value range.
+		// Note: partition boundaries are uniform, but actual row distribution may be skewed
+		// due to non-uniform data distribution, gaps in the value range, or deleted rows.
 		minmaxQuery := fmt.Sprintf("SELECT MIN(%[2]s),MAX(%[2]s) FROM %[1]s %[3]s",
 			parsedWatermarkTable.String(), quotedWatermarkColumn, whereClause)
 		var row pgx.Row
@@ -295,6 +313,85 @@ func (c *PostgresConnector) getNumRowsPartitions(
 	}
 }
 
+func (c *PostgresConnector) getCTIDBlockPartitions(
+	ctx context.Context,
+	tx pgx.Tx,
+	parsedWatermarkTable utils.SchemaTable,
+	numPartitions int64,
+	last *protos.QRepPartition,
+) ([]*protos.QRepPartition, error) {
+	if numPartitions <= 1 {
+		return nil, errors.New("expect numPartitions to be greater than 1")
+	}
+
+	blocksQuery := "SELECT (pg_relation_size(to_regclass($1)) / current_setting('block_size')::int)::bigint"
+	var totalBlocks pgtype.Int8
+	if err := tx.QueryRow(ctx, blocksQuery, parsedWatermarkTable.String()).Scan(&totalBlocks); err != nil {
+		return nil, fmt.Errorf("failed to get relation blocks: %w", err)
+	}
+	if !totalBlocks.Valid || totalBlocks.Int64 <= 0 {
+		return nil, fmt.Errorf("total blocks: %d, valid: %t", totalBlocks.Int64, totalBlocks.Valid)
+	}
+
+	tidCmp := func(a pgtype.TID, b pgtype.TID) int {
+		if blockCmp := cmp.Compare(a.BlockNumber, b.BlockNumber); blockCmp != 0 {
+			return blockCmp
+		}
+		return cmp.Compare(a.OffsetNumber, b.OffsetNumber)
+	}
+
+	tidInc := func(t pgtype.TID) pgtype.TID {
+		if t.OffsetNumber < math.MaxUint16 {
+			return pgtype.TID{BlockNumber: t.BlockNumber, OffsetNumber: t.OffsetNumber + 1, Valid: true}
+		}
+		return pgtype.TID{BlockNumber: t.BlockNumber + 1, OffsetNumber: 0, Valid: true}
+	}
+
+	tidRangeForPartition := func(partitionIndex int64) (pgtype.TID, pgtype.TID, bool) {
+		blockStart := uint32((partitionIndex * totalBlocks.Int64) / numPartitions)
+		nextPartitionBlockStart := uint32(((partitionIndex + 1) * totalBlocks.Int64) / numPartitions)
+		if nextPartitionBlockStart <= blockStart {
+			return pgtype.TID{}, pgtype.TID{}, false
+		}
+		tidStartInclusive := pgtype.TID{BlockNumber: blockStart, OffsetNumber: 0, Valid: true}
+		tidEndInclusive := pgtype.TID{BlockNumber: nextPartitionBlockStart - 1, OffsetNumber: math.MaxUint16, Valid: true}
+		return tidStartInclusive, tidEndInclusive, true
+	}
+
+	var resumeFrom pgtype.TID
+	if last != nil && last.Range != nil {
+		if lr, ok := last.Range.Range.(*protos.PartitionRange_TidRange); ok {
+			resume := pgtype.TID{BlockNumber: lr.TidRange.End.BlockNumber, OffsetNumber: uint16(lr.TidRange.End.OffsetNumber), Valid: true}
+			resumeFrom = tidInc(resume)
+		} else {
+			c.logger.Warn("Ignoring resume offset because it's not TidRange")
+		}
+	}
+
+	partitionHelper := utils.NewPartitionHelper(c.logger)
+	for i := range numPartitions {
+		start, end, valid := tidRangeForPartition(i)
+		if !valid {
+			continue
+		}
+		if resumeFrom.Valid {
+			if tidCmp(end, resumeFrom) < 0 {
+				continue
+			}
+			if tidCmp(start, resumeFrom) < 0 {
+				start = resumeFrom
+			}
+		}
+		if err := partitionHelper.AddPartition(
+			pgtype.TID{BlockNumber: start.BlockNumber, OffsetNumber: start.OffsetNumber, Valid: true},
+			pgtype.TID{BlockNumber: end.BlockNumber, OffsetNumber: end.OffsetNumber, Valid: true},
+		); err != nil {
+			return nil, fmt.Errorf("failed to add TID partition: %w", err)
+		}
+	}
+	return partitionHelper.GetPartitions(), nil
+}
+
 func (c *PostgresConnector) getMinMaxValues(
 	ctx context.Context,
 	tx pgx.Tx,
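As a worked example of `tidRangeForPartition` (hypothetical numbers, not taken from the commit): with `totalBlocks = 10` and `numPartitions = 3`, the inclusive TID ranges come out as (0,0)..(2,65535), (3,0)..(5,65535), and (6,0)..(9,65535). Integer division guarantees the buckets tile the block range without gaps or overlap, and on resume `tidInc` turns the last completed partition's inclusive end into the next start; this is the contiguity property the e2e test below asserts.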

flow/e2e/clickhouse_test.go

Lines changed: 111 additions & 0 deletions

@@ -3,6 +3,7 @@ package e2e
 import (
 	"embed"
 	"fmt"
+	"math"
 	"math/big"
 	"reflect"
 	"regexp"
@@ -12,6 +13,7 @@ import (
 	"time"
 
 	"github.com/jackc/pgerrcode"
+	"github.com/jackc/pgx/v5/pgtype"
 	"github.com/shopspring/decimal"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
@@ -2830,3 +2832,112 @@ func (s ClickHouseSuite) Test_PartitionByExpr() {
 	env.Cancel(s.t.Context())
 	RequireEnvCanceled(s.t, env)
 }
+
+func (s ClickHouseSuite) Test_Partition_By_CTID_With_Num_Partitions_Override() {
+	if _, ok := s.source.(*PostgresSource); !ok {
+		s.t.Skip("only applies to postgres")
+	}
+
+	srcTableName := "test_ctid_block_partitions"
+	srcFullName := s.attachSchemaSuffix(srcTableName)
+	dstTableName := "test_ctid_block_partitions_dst"
+
+	require.NoError(s.t, s.source.Exec(s.t.Context(), fmt.Sprintf(`
+	CREATE TABLE IF NOT EXISTS %s (
+		id SERIAL PRIMARY KEY,
+		name TEXT,
+		age INT,
+		email TEXT,
+		created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+	)
+	`, srcFullName)))
+	numRows := 1000
+	deletedRows := 10
+	for i := 1; i <= numRows; i++ {
+		require.NoError(s.t, s.source.Exec(s.t.Context(), fmt.Sprintf(`
+		INSERT INTO %s (name, age, email) VALUES ('user_%d', %d, 'user_%[email protected]')
+		`, srcFullName, i, 20+(i%50), i)))
+	}
+	for i := 1; i <= numRows; i++ {
+		require.NoError(s.t, s.source.Exec(s.t.Context(), fmt.Sprintf(`
+		UPDATE %s SET age = %d WHERE id = %d
+		`, srcFullName, 30+(i%50), i)))
+	}
+	for i := 1; i <= deletedRows; i++ {
+		require.NoError(s.t, s.source.Exec(s.t.Context(), fmt.Sprintf(`
+		DELETE FROM %s WHERE id = %d
+		`, srcFullName, i)))
+	}
+
+	connectionGen := FlowConnectionGenerationConfig{
+		FlowJobName:      s.attachSuffix("clickhouse_partition_by_ctid"),
+		TableNameMapping: map[string]string{srcFullName: dstTableName},
+		Destination:      s.Peer().Name,
+	}
+	flowConnConfig := connectionGen.GenerateFlowConnectionConfigs(s)
+	flowConnConfig.DoInitialSnapshot = true
+	flowConnConfig.SnapshotNumPartitionsOverride = 3
+
+	tc := NewTemporalClient(s.t)
+	env := ExecutePeerflow(s.t, tc, flowConnConfig)
+
+	SetupCDCFlowStatusQuery(s.t, env, flowConnConfig)
+	EnvWaitForCount(env, s, "wait on initial", dstTableName, "id", numRows-deletedRows)
+
+	require.NoError(s.t, s.source.Exec(s.t.Context(), fmt.Sprintf(`
+	INSERT INTO %s (name, age, email) VALUES ('user_%d', %d, 'user_%[email protected]')
+	`, srcFullName, numRows+1, 25, numRows+1)))
+	EnvWaitForCount(env, s, "wait on cdc", dstTableName, "id", numRows-deletedRows+1)
+
+	rows, err := s.Conn().Query(s.t.Context(),
+		`SELECT partition_start, partition_end FROM peerdb_stats.qrep_partitions WHERE parent_mirror_name = $1
+		ORDER BY
+			CAST(split_part(trim(both '()' from partition_start), ',', 1) AS bigint),
+			CAST(split_part(trim(both '()' from partition_start), ',', 2) AS bigint)`,
+		flowConnConfig.FlowJobName)
+	require.NoError(s.t, err, "failed to query partition ranges")
+	defer rows.Close()
+
+	var partitionRanges []struct{ start, end string }
+	for rows.Next() {
+		var start, end string
+		require.NoError(s.t, rows.Scan(&start, &end), "failed to scan partition range")
+		partitionRanges = append(partitionRanges, struct{ start, end string }{start, end})
+	}
+	require.NoError(s.t, rows.Err())
+	// Verify partition count matches override
+	require.Len(s.t, partitionRanges, 3, "expected exactly 3 partitions to be created with SnapshotNumPartitionsOverride=3")
+
+	// Verify partition ranges are contiguous (intentionally ignoring `TID.Valid` field for tests)
+	tidParse := func(tidStr string) pgtype.TID {
+		blockStr, offsetStr, found := strings.Cut(tidStr[1:len(tidStr)-1], ",")
+		require.True(s.t, found, "failed to parse block number")
+		block, err := strconv.ParseUint(blockStr, 10, 32)
+		require.NoError(s.t, err, "failed to parse block number")
+		offset, err := strconv.ParseUint(offsetStr, 10, 16)
+		require.NoError(s.t, err, "failed to parse offset number")
+		return pgtype.TID{BlockNumber: uint32(block), OffsetNumber: uint16(offset)}
+	}
+	tidInc := func(t pgtype.TID) pgtype.TID {
+		if t.OffsetNumber < math.MaxUint16 {
+			return pgtype.TID{BlockNumber: t.BlockNumber, OffsetNumber: t.OffsetNumber + 1}
+		}
+		return pgtype.TID{BlockNumber: t.BlockNumber + 1, OffsetNumber: 0}
+	}
+	tidEq := func(t1, t2 pgtype.TID) bool {
+		return t1.BlockNumber == t2.BlockNumber && t1.OffsetNumber == t2.OffsetNumber
+	}
+	for i, pr := range partitionRanges {
+		startTID := tidParse(pr.start)
+		if i > 0 {
+			prevEndTID := tidParse(partitionRanges[i-1].end)
+			require.True(s.t, tidEq(tidInc(prevEndTID), startTID),
+				"partitions not contiguous; partition ranges are %v", partitionRanges)
+		} else {
+			require.True(s.t, tidEq(pgtype.TID{}, startTID))
+		}
+	}
+
+	env.Cancel(s.t.Context())
+	RequireEnvCanceled(s.t, env)
+}
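A note on what this test exercises: the table is deliberately churned (1000 inserts, 1000 updates, 10 deletes) so block-based partitioning runs against dead tuples and bloat. The assertions then verify that exactly 3 partitions were recorded (matching `SnapshotNumPartitionsOverride = 3`) and that their TID ranges tile the space contiguously starting from (0,0), comparing only block and offset numbers while intentionally ignoring the `TID.Valid` flag.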
