Skip to content

Commit 8f03a96

Browse files
committed
mma: improve commentary and test cases for config normalization
Epic: CRDB-55052 Release note: None
1 parent 1f1ee11 commit 8f03a96

File tree

3 files changed

+323
-27
lines changed

3 files changed

+323
-27
lines changed

pkg/kv/kvserver/allocator/mmaprototype/cluster_state.go

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1663,8 +1663,18 @@ func (cs *clusterState) processStoreLeaseholderMsgInternal(
16631663
if rangeMsg.MaybeSpanConfIsPopulated {
16641664
normSpanConfig, err := makeNormalizedSpanConfig(&rangeMsg.MaybeSpanConf, cs.constraintMatcher.interner)
16651665
if err != nil {
1666-
// TODO(kvoli): Should we log as a warning here, or return further back out?
1667-
panic(err)
1666+
if normSpanConfig == nil {
1667+
// TODO: the roachpb.SpanConfig violated the basic requirements
1668+
// documented in the proto. We need to ensure that the checks that
1669+
// happened here are also done when the user set the ZoneConfig, so
1670+
// that we can reject such violations up front. At this point in the
1671+
// code we have no way of continuing, so we panic, but we must ensure
1672+
// we never get here in production for user specified input.
1673+
panic(err)
1674+
} else {
1675+
log.KvDistribution.Warningf(
1676+
ctx, "range r%v span config had errors in normalization: %v", rangeMsg.RangeID, err)
1677+
}
16681678
}
16691679
rs.conf = normSpanConfig
16701680
}

pkg/kv/kvserver/allocator/mmaprototype/constraint.go

Lines changed: 125 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -128,18 +128,87 @@ func (ic internedConstraint) less(b internedConstraint) bool {
128128
// constraints are in increasing order using internedConstraint.less.
129129
type constraintsConj []internedConstraint
130130

131+
// conjunctionRelationship describes the binary relationship between sets of
132+
// stores satisfying two constraint conjunctions. A constraint conjunction
133+
// (constraintsConj) represents all possible sets of stores that satisfy all
134+
// constraints in the conjunction. For example, the conjunction [+region=a,
135+
// +zone=a1] represents all possible sets of stores located in region "a" and
136+
// zone "a1".
137+
//
138+
// This relationship (between two constraintsConj denoted A and B in their
139+
// enum comments) is computed purely by comparing the conjuncts of A and B,
140+
// without knowledge of which stores actually exist in the cluster. In some
141+
// cases, specifically conjEqualSet, conjStrictSubset, conjStrictSuperset,
142+
// these can be understood to hold for any possible concrete set of stores. In
143+
// other cases, specifically conjNonIntersecting and conjPossiblyIntersecting,
144+
// these are a best-effort guess based on the structure of the conjuncts
145+
// alone.
146+
//
147+
// The best-effort relationship computation assumes that if two conjuncts are
148+
// not equal their sets are non-overlapping. This simplifying assumption is ok
149+
// since this relationship feeds into a best-effort structural normalization.
131150
type conjunctionRelationship int
132151

133-
// Relationship between conjunctions used for structural normalization. This
134-
// relationship is solely defined based on the conjunctions, and not based on
135-
// what stores actually match. It simply assumes that if two conjuncts are not
136-
// equal their sets are non-overlapping. This simplifying assumption is made
137-
// since we are only trying to do a best-effort structural normalization.
138152
const (
139-
conjIntersecting conjunctionRelationship = iota
153+
// conjPossiblyIntersecting: This is an indeterminate case: The structural
154+
// relationship alone cannot determine how the store sets relate; the actual
155+
// set of stores in the cluster would need to be known. The store sets may
156+
// be disjoint, overlapping, or even have a subset relationship, depending
157+
// on which stores actually exist. In practice, this is the case in which A
158+
// and B share some constraints, but differ in others.
159+
//
160+
// Example: A=[+region=a, +zone=a1], B=[+region=a, -zone=a2].
161+
// - if all stores are in a2, B is empty and thus a subset of A.
162+
// - if all stores are in a1, A and B are equal.
163+
// - if only a1 and a2 have stores, then -zone=a2 is equivalent to +zone=a1,
164+
// so A and B are equal.
165+
// - if only a2 and a3 have stores, then A is empty and thus a strict
166+
// subset of B, which is not empty.
167+
// - if a1, a2, and a3 have stores, then A and B intersect but are not equal:
168+
// - stores in a1 match both A and B.
169+
// - stores in a2 match neither constraint
170+
// - stores in a3 match only B
171+
// and thus B is a strict subset of A.
172+
// Note how the relationship between A and B varies based on the actual set of
173+
// stores: sometimes B contains A, and sometimes the other way around.
174+
conjPossiblyIntersecting conjunctionRelationship = iota
175+
176+
// conjEqualSet: A and B have identical constraint lists, so they represent
177+
// the same set of stores.
178+
//
179+
// Example: A=[+region=a, +zone=a1], B=A
140180
conjEqualSet
181+
182+
// conjStrictSubset: A has more constraints than B, so A's store set is a
183+
// strict subset of B's store set. Every store matching A also matches B,
184+
// but some stores matching B do not match A.
185+
//
186+
// Example: A=[+region=a, +zone=a1], B=[+region=a]
187+
// A matches stores in region=a AND zone=a1.
188+
// B matches all stores in region=a (any zone).
189+
// Therefore, A's store set ⊆ B's store set.
141190
conjStrictSubset
191+
192+
// conjStrictSuperset: A has fewer constraints than B, so A's store set is a
193+
// strict superset of B's store set. Every store matching B also matches A,
194+
// but some stores matching A do not match B.
195+
//
196+
// Example: A=[+region=a], B=[+region=a, +zone=a1]
197+
// A matches all stores in region=a.
198+
// B matches only stores in region=a AND zone=a1.
199+
// Therefore, A's store set ⊇ B's store set.
142200
conjStrictSuperset
201+
202+
// conjNonIntersecting: This is another indeterminate case. A and B have
203+
// no constraints in common, so we assume their store sets are disjoint
204+
// (no store can match both).
205+
//
206+
// Example: A=[+region=a], B=[+region=b]
207+
// A matches stores in region=a, B matches stores in region=b.
208+
// Since a store cannot be in both regions, the sets are disjoint.
209+
// Example: A=[+region=a], B=[+zone=a1]
210+
// If zone=a1 happens to be in region=a, then the disjoint result is
211+
// not correct.
143212
conjNonIntersecting
144213
)
145214

@@ -191,7 +260,7 @@ func (cc constraintsConj) relationship(b constraintsConj) conjunctionRelationshi
191260
}
192261
// (extraInB > 0 && extraInCC > 0)
193262
if inBoth > 0 {
194-
return conjIntersecting
263+
return conjPossiblyIntersecting
195264
}
196265
return conjNonIntersecting
197266
}
@@ -220,6 +289,19 @@ type internedLeasePreference struct {
220289
// SpanConfig for which we don't have a normalized value. The rest of the
221290
// allocator code works with normalizedSpanConfig. Due to the infrequent
222291
// nature of this, we don't attempt to reduce memory allocations.
292+
//
293+
// It does:
294+
//
295+
// - Basic normalization of the constraints and voterConstraints, to specify
296+
// the number of replicas for every constraints conjunction, and ensure that
297+
// the sum adds up to the required number of replicas. These are
298+
// requirements documented in roachpb.SpanConfig. If this basic
299+
// normalization fails, it returns a nil normalizedSpanConfig and an error.
300+
//
301+
// - Structural normalization: see comment in doStructuralNormalization. An
302+
// error in this step causes it to return a non-nil normalizedSpanConfig,
303+
// with an error, so that the error can be surfaced somehow while keeping
304+
// the system running.
223305
func makeNormalizedSpanConfig(
224306
conf *roachpb.SpanConfig, interner *stringInterner,
225307
) (*normalizedSpanConfig, error) {
@@ -263,9 +345,16 @@ func makeNormalizedSpanConfig(
263345
leasePreferences: lps,
264346
interner: interner,
265347
}
266-
return doStructuralNormalization(nConf)
348+
err = doStructuralNormalization(nConf)
349+
return nConf, err
267350
}
268351

352+
// normalizeConstraints normalizes and interns the given constraints. Every
353+
// internedConstraintsConjunction has numReplicas > 0, and the sum of these
354+
// equals the parameter numReplicas.
355+
//
356+
// It returns an error if the input constraints don't satisfy the requirements
357+
// documented in roachpb.SpanConfig related to NumReplicas.
269358
func normalizeConstraints(
270359
constraints []roachpb.ConstraintsConjunction, numReplicas int32, interner *stringInterner,
271360
) ([]internedConstraintsConjunction, error) {
@@ -322,13 +411,14 @@ func normalizeConstraints(
322411
// "strictness" comment in roachpb.SpanConfig which now requires users to
323412
// repeat the information).
324413
//
325-
// This function does some structural normalization even when returning an
326-
// error. See the under-specified voter constraint examples in the datadriven
327-
// test -- we sometimes see these in production settings, and we want to fix
328-
// ones that we can, and raise an error for users to fix their configs.
329-
func doStructuralNormalization(conf *normalizedSpanConfig) (*normalizedSpanConfig, error) {
414+
// This function mutates the conf argument. It does some structural
415+
// normalization even when returning an error. See the under-specified voter
416+
// constraint examples in the datadriven test -- we sometimes see these in
417+
// production settings, and we want to fix ones that we can, and raise an
418+
// error for users to fix their configs.
419+
func doStructuralNormalization(conf *normalizedSpanConfig) error {
330420
if len(conf.constraints) == 0 || len(conf.voterConstraints) == 0 {
331-
return conf, nil
421+
return nil
332422
}
333423
// Relationships between each voter constraint and each all replica
334424
// constraint.
@@ -359,9 +449,25 @@ func doStructuralNormalization(conf *normalizedSpanConfig) (*normalizedSpanConfi
359449
slices.SortFunc(rels, func(a, b relationshipVoterAndAll) int {
360450
return cmp.Compare(a.voterAndAllRel, b.voterAndAllRel)
361451
})
362-
// First are the intersecting constraints, which cause an error.
452+
// First are the intersecting constraints, which cause an error (though we
453+
// proceed with normalization). This is because when there are both shared
454+
// and non shared conjuncts, we cannot be certain that the conjunctions are
455+
// non-intersecting. When the conjunctions are intersecting, we cannot
456+
// promote from one to the other to fill out the set of conjunctions.
457+
//
458+
// Example 1: +region=a,+zone=a1 and +region=a,+zone=a2 are classified as
459+
// conjPossiblyIntersecting, but we could do better in knowing that the
460+
// conjunctions are non-intersecting since zone=a1 and zone=a2 are disjoint.
461+
//
462+
// TODO(sumeer): improve the case of example 1.
463+
//
464+
// Example 2: +region=a,+zone=a1 and +region=a,-zone=a2 are classified as
465+
// conjPossiblyIntersecting. And if there happens to be a zone=a3 in the
466+
// region, they are actually intersecting. We cannot do better since we
467+
// don't know the semantics of regions, zones or the universe of possible
468+
// values of the zone.
363469
index := 0
364-
for rels[index].voterAndAllRel == conjIntersecting {
470+
for rels[index].voterAndAllRel == conjPossiblyIntersecting {
365471
index++
366472
}
367473
var err error
@@ -583,9 +689,9 @@ func doStructuralNormalization(conf *normalizedSpanConfig) (*normalizedSpanConfi
583689
slices.SortFunc(rels, func(a, b relationshipVoterAndAll) int {
584690
return cmp.Compare(a.voterAndAllRel, b.voterAndAllRel)
585691
})
586-
// Ignore conjIntersecting.
692+
// Ignore conjPossiblyIntersecting.
587693
index = 0
588-
for rels[index].voterAndAllRel == conjIntersecting {
694+
for rels[index].voterAndAllRel == conjPossiblyIntersecting {
589695
index++
590696
}
591697
voterConstraintHasEqualityWithConstraint := make([]bool, len(conf.voterConstraints))
@@ -647,7 +753,7 @@ func doStructuralNormalization(conf *normalizedSpanConfig) (*normalizedSpanConfi
647753
}
648754
}
649755

650-
return conf, err
756+
return err
651757
}
652758

653759
type replicaKindIndex int32

0 commit comments

Comments
 (0)