Skip to content

Commit 7d42220

Browse files
committed
cspann: do not duplicate vectors to avoid an empty partition
Previously, the split fixup would sometimes add an interior centroid vector to two different parent partitions. In the past, this was necessary in order to avoid creating an empty partition that could unbalance the K-means tree. But this is no longer necessary; empty partitions are now allowed. So this commit removes that code. Epic: CRDB-42943 Release note: None
1 parent fd3daf9 commit 7d42220

File tree

5 files changed

+112
-77
lines changed

5 files changed

+112
-77
lines changed

pkg/sql/vecindex/cspann/fixup_split.go

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1053,15 +1053,6 @@ func (fw *fixupWorker) copyToSplitSubPartitions(
10531053
return err
10541054
}
10551055

1056-
if sourcePartition.Level() != LeafLevel && vectors.Count == 1 {
1057-
// This should have been a merge, not a split, but we're too far into the
1058-
// split operation to back out now, so avoid an empty non-root partition by
1059-
// duplicating the last remaining vector in both partitions.
1060-
rightVectors = leftVectors
1061-
rightChildKeys = leftChildKeys
1062-
rightValueBytes = leftValueBytes
1063-
}
1064-
10651056
rightPartitionKey := sourceState.Target2
10661057
added, err = fw.addToPartition(ctx,
10671058
rightPartitionKey, rightVectors, rightChildKeys, rightValueBytes, rightMetadata)

pkg/sql/vecindex/cspann/index_test.go

Lines changed: 65 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ import (
1717
"sync"
1818
"testing"
1919
"time"
20+
"unicode/utf8"
2021

2122
"github.com/cockroachdb/cockroach/pkg/sql/vecindex/cspann"
2223
"github.com/cockroachdb/cockroach/pkg/sql/vecindex/cspann/commontest"
@@ -162,7 +163,30 @@ func (s *testState) LoadIndex(d *datadriven.TestData) string {
162163
s.makeNewIndex(d)
163164

164165
lines := strings.Split(d.Input, "\n")
165-
s.loadIndexFromFormat(s.TreeKey, lines, 0)
166+
167+
// Determine the root level by determining max indent.
168+
maxIndent := 0
169+
for _, line := range lines {
170+
// Lines ending with the "│" rune don't increase the level
171+
if strings.HasSuffix(line, "│") {
172+
continue
173+
}
174+
175+
// Leaf vector lines don't increase the level.
176+
idx := strings.Index(line, "• ")
177+
if idx != -1 {
178+
// Note that the '•' rune is 3 UTF-8 bytes.
179+
firstChar := line[idx+4]
180+
if firstChar < '0' || firstChar > '9' {
181+
// This is a leaf vector.
182+
continue
183+
}
184+
}
185+
186+
maxIndent = max(maxIndent, getIndexFormatIndent(line))
187+
}
188+
189+
s.loadIndexFromFormat(s.TreeKey, lines, cspann.Level(maxIndent+1))
166190

167191
return fmt.Sprintf("Loaded %d vectors.\n", len(s.MemStore.GetAllVectors()))
168192
}
@@ -848,8 +872,8 @@ func (s *testState) parseKeyAndVector(line string) (cspann.KeyBytes, vector.T) {
848872
}
849873

850874
func (s *testState) loadIndexFromFormat(
851-
treeKey cspann.TreeKey, lines []string, indent int,
852-
) (remaining []string, level cspann.Level, centroid vector.T, childKey cspann.ChildKey) {
875+
treeKey cspann.TreeKey, lines []string, level cspann.Level,
876+
) (remaining []string, centroid vector.T, childKey cspann.ChildKey) {
853877
// Ensure line contains "• ", note that the '•' rune is 3 UTF-8 bytes.
854878
idx := strings.Index(lines[0], "• ")
855879
require.NotEqual(s.T, -1, idx)
@@ -863,7 +887,7 @@ func (s *testState) loadIndexFromFormat(
863887
vec := s.parseVector(line)
864888
s.MemStore.InsertVector(keyBytes, vec)
865889
randomized := s.Index.TransformVector(vec, make(vector.T, len(vec)))
866-
return lines[1:], cspann.LeafLevel, randomized, cspann.ChildKey{KeyBytes: keyBytes}
890+
return lines[1:], randomized, cspann.ChildKey{KeyBytes: keyBytes}
867891
}
868892

869893
// This is an interior partition.
@@ -885,33 +909,33 @@ func (s *testState) loadIndexFromFormat(
885909
centroid = s.parseVector(line)
886910
centroid = s.Index.TransformVector(centroid, make(vector.T, len(centroid)))
887911

912+
// Parse any children.
913+
parentIndent := getIndexFormatIndent(lines[0])
888914
lines = lines[1:]
889915

890-
childLevel := cspann.LeafLevel
891916
childVectors := vector.MakeSet(len(centroid))
892917
childKeys := []cspann.ChildKey(nil)
893918

894-
if len(lines) > 0 && strings.HasSuffix(lines[0], "│") {
895-
// There are children, so loop over them.
896-
childIndent := len(lines[0]) - len("│")
897-
for len(lines) > 0 && len(lines[0]) > childIndent {
898-
remainder := lines[0][childIndent:]
899-
if remainder == "│" {
900-
// Skip line.
901-
lines = lines[1:]
902-
continue
903-
} else if strings.HasPrefix(remainder, "├") || strings.HasPrefix(remainder, "└") {
904-
var childVector vector.T
905-
lines, childLevel, childVector, childKey = s.loadIndexFromFormat(treeKey, lines, childIndent)
906-
childVectors.Add(childVector)
907-
childKeys = append(childKeys, childKey)
908-
}
919+
for len(lines) > 0 {
920+
childIndent := getIndexFormatIndent(lines[0])
921+
if childIndent <= parentIndent {
922+
break
923+
}
924+
if strings.HasSuffix(lines[0], "│") {
925+
// Skip line.
926+
lines = lines[1:]
927+
continue
928+
} else {
929+
var childVector vector.T
930+
lines, childVector, childKey = s.loadIndexFromFormat(treeKey, lines, level-1)
931+
childVectors.Add(childVector)
932+
childKeys = append(childKeys, childKey)
909933
}
910934
}
911935

912936
// Always create partition in Ready state so that adds are allowed.
913937
metadata := cspann.PartitionMetadata{
914-
Level: childLevel,
938+
Level: level,
915939
Centroid: centroid,
916940
}
917941
metadata.StateDetails.MakeReady()
@@ -934,7 +958,7 @@ func (s *testState) loadIndexFromFormat(
934958
require.NoError(s.T, err)
935959
}
936960

937-
return lines, childLevel + 1, centroid, cspann.ChildKey{PartitionKey: partitionKey}
961+
return lines, centroid, cspann.ChildKey{PartitionKey: partitionKey}
938962
}
939963

940964
// parsePartitionStateDetails parses a partition state details string in this
@@ -1267,3 +1291,22 @@ func validateIndex(
12671291
})
12681292
require.Empty(t, missingKeys)
12691293
}
1294+
1295+
// getIndexFormatIndent returns the number of indentation levels in the given
1296+
// formatted line. For example, for this snippet:
1297+
//
1298+
// • 1 (0, 0)
1299+
// │
1300+
// ├───• 2 (-9, 18)
1301+
//
1302+
// Line 1 would have indent of 0 and lines 2 and 3 would have indent of 1.
1303+
func getIndexFormatIndent(line string) int {
1304+
// Compute level for line containing the "•" rune.
1305+
idx := strings.Index(line, "• ")
1306+
if idx != -1 {
1307+
return utf8.RuneCountInString(line[:idx]) / 4
1308+
}
1309+
1310+
// Otherwise, this must be a line ending with the "│" rune.
1311+
return utf8.RuneCountInString(line)/4 + 1
1312+
}

pkg/sql/vecindex/cspann/testdata/split-non-root-step.ddt

Lines changed: 1 addition & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -558,43 +558,12 @@ force-split partition-key=2 parent-partition-key=1 steps=7
558558
559559
└───• 5 (5, 6) [Updating:2]
560560

561-
# Next steps should duplicate the last remaining vector in sub-partition #5
562-
# rather than leave it empty.
563-
force-split partition-key=2 parent-partition-key=1 steps=2
564-
----
565-
• 1 (3, 6)
566-
567-
├───• 2 (3, 6) [DrainingForSplit:4,5]
568-
├───• 4 (5, 6) [Updating:2]
569-
│ │
570-
│ └───• 3 (5, 6)
571-
│ │
572-
│ ├───• vec2 (3, 5)
573-
│ ├───• vec3 (6, 8)
574-
│ └───• vec4 (6, 5)
575-
576-
└───• 5 (5, 6) [Updating:2]
577-
578-
└───• 3 (5, 6)
579-
580-
├───• vec2 (3, 5)
581-
├───• vec3 (6, 8)
582-
└───• vec4 (6, 5)
583-
584-
# Finish split. Partition #3 should be referenced by both partition #4 and
585-
# partition #5.
561+
# Target partition #5 should be left empty.
586562
force-split partition-key=2 parent-partition-key=1
587563
----
588564
• 1 (3, 6)
589565
590566
├───• 5 (5, 6)
591-
│ │
592-
│ └───• 3 (5, 6)
593-
│ │
594-
│ ├───• vec2 (3, 5)
595-
│ ├───• vec3 (6, 8)
596-
│ └───• vec4 (6, 5)
597-
598567
└───• 4 (5, 6)
599568
600569
└───• 3 (5, 6)

pkg/sql/vecindex/cspann/testdata/split-root-step.ddt

Lines changed: 2 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -158,19 +158,12 @@ force-split partition-key=1 root=3 steps=5
158158
├───• vec1 (1, 2)
159159
└───• vec2 (4, 3)
160160

161-
# Next steps should duplicate the last remaining vector in partition #1 and also
162-
# add it to partition #4. This will prevent partition #4 from being empty, which
163-
# would be a violation of the constraint that the K-means tree must be balanced.
161+
# Partition #4 should be left empty.
164162
force-split partition-key=1 root=4 steps=1
165163
----
166164
• 4 (2.5, 2.5) [Updating:1]
167-
168-
└───• 2 (2.5, 2.5)
169-
170-
├───• vec1 (1, 2)
171-
└───• vec2 (4, 3)
172165

173-
# Finish the split, with partition #2 a child of both sub-partitions.
166+
# Finish the split.
174167
force-split partition-key=1
175168
----
176169
• 1 (6.8, 4.2)
@@ -183,11 +176,6 @@ force-split partition-key=1
183176
│ └───• vec2 (4, 3)
184177
185178
└───• 4 (2.5, 2.5)
186-
187-
└───• 2 (2.5, 2.5)
188-
189-
├───• vec1 (1, 2)
190-
└───• vec2 (4, 3)
191179

192180
# ----------------------------------------------------------------------
193181
# Search the tree when the root is in splitting states.

pkg/sql/vecindex/cspann/testdata/split.ddt

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -821,3 +821,47 @@ force-split partition-key=3 parent-partition-key=1
821821
822822
├───• vec5 (5, 3)
823823
└───• vec4 (6, 2)
824+
825+
# ----------------------------------------------------------------------
826+
# Empty split target partition. This regresses a bug caused by code that
827+
# was trying to insert partition 4 into both partition 5 and 6, in order
828+
# to avoid creating an empty partition. Instead, partition 4 wasn't
829+
# inserted into either partition, becoming a "lost update". The fix was
830+
# to remove the code entirely, as it's no longer a problem for empty
831+
# partitions to exist.
832+
# ----------------------------------------------------------------------
833+
834+
load-index min-partition-size=1 max-partition-size=4 beam-size=2
835+
• 1 (0, 0)
836+
837+
├───• 2 (-9, 18)
838+
│ │
839+
│ ├───• 3 (-9, 18) [Splitting:5,6]
840+
│ │ │
841+
│ │ └───• 4 (-9, 18)
842+
│ │ │
843+
│ │ ├───• vec39 (-4, 13)
844+
│ │ └───• vec60 (-14, 23)
845+
│ │
846+
│ ├───• 5 (-9, 18) [Updating:3]
847+
│ └───• 6 (4, 3) [Updating:3]
848+
849+
└───• 7 (5, 8)
850+
----
851+
Loaded 2 vectors.
852+
853+
force-split partition-key=3 parent-partition-key=2
854+
----
855+
• 1 (0, 0)
856+
857+
├───• 2 (-9, 18)
858+
│ │
859+
│ ├───• 6 (4, 3)
860+
│ │ │
861+
│ │ └───• 4 (-9, 18)
862+
│ │ │
863+
│ │ ├───• vec39 (-4, 13)
864+
│ │ └───• vec60 (-14, 23)
865+
│ │
866+
│ └───• 5 (-9, 18)
867+
└───• 7 (5, 8)

0 commit comments

Comments
 (0)