@@ -8,6 +8,7 @@ package ttljob
 import (
 	"context"
 	"fmt"
+	"sync/atomic"
 	"testing"

 	"github.com/cockroachdb/cockroach/pkg/base"
@@ -155,3 +156,192 @@ func TestTTLProgressLifecycle(t *testing.T) {
 	require.Equal(t, int64(1000), ttlProgress.JobDeletedRowCount)
 	require.Len(t, ttlProgress.ProcessorProgresses, 2)
 }
+
+func TestReplanDecider(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	defer log.Scope(t).Close(t)
+
+	testCases := []struct {
+		desc         string
+		beforeNodes  []base.SQLInstanceID
+		afterNodes   []base.SQLInstanceID
+		threshold    float64
+		expectReplan bool
+	}{
+		{
+			desc:         "nodes don't change",
+			beforeNodes:  []base.SQLInstanceID{1, 2, 3},
+			afterNodes:   []base.SQLInstanceID{1, 2, 3},
+			threshold:    0.1,
+			expectReplan: false,
+		},
+		{
+			desc:         "one node is shut down",
+			beforeNodes:  []base.SQLInstanceID{1, 2, 3},
+			afterNodes:   []base.SQLInstanceID{1, 3},
+			threshold:    0.1,
+			expectReplan: true,
+		},
+		{
+			desc:         "one node is brought online",
+			beforeNodes:  []base.SQLInstanceID{1, 2, 3},
+			afterNodes:   []base.SQLInstanceID{1, 2, 3, 4},
+			threshold:    0.1,
+			expectReplan: false,
+		},
+		{
+			desc:         "one node is replaced",
+			beforeNodes:  []base.SQLInstanceID{1, 2, 3},
+			afterNodes:   []base.SQLInstanceID{1, 2, 4},
+			threshold:    0.1,
+			expectReplan: true,
+		},
+		{
+			desc:         "multiple nodes shut down",
+			beforeNodes:  []base.SQLInstanceID{1, 2, 3, 4, 5},
+			afterNodes:   []base.SQLInstanceID{1, 3},
+			threshold:    0.1,
+			expectReplan: true,
+		},
+		{
+			desc:         "all nodes replaced",
+			beforeNodes:  []base.SQLInstanceID{1, 2, 3},
+			afterNodes:   []base.SQLInstanceID{4, 5, 6},
+			threshold:    0.1,
+			expectReplan: true,
+		},
+		{
+			desc:         "threshold boundary: exactly at threshold",
+			beforeNodes:  []base.SQLInstanceID{1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
+			afterNodes:   []base.SQLInstanceID{1, 2, 3, 4, 5, 6, 7, 8, 9},
+			threshold:    0.1,
+			expectReplan: false,
+		},
+		{
+			desc:         "threshold boundary: just above threshold",
+			beforeNodes:  []base.SQLInstanceID{1, 2, 3, 4, 5, 6, 7, 8, 9},
+			afterNodes:   []base.SQLInstanceID{1, 2, 3, 4, 5, 6, 7, 8},
+			threshold:    0.1,
+			expectReplan: true,
+		},
+		{
+			desc:         "threshold disabled",
+			beforeNodes:  []base.SQLInstanceID{1, 2, 3},
+			afterNodes:   []base.SQLInstanceID{1, 2},
+			threshold:    0.0,
+			expectReplan: false,
+		},
+		{
+			desc:         "large scale: many nodes lost",
+			beforeNodes:  []base.SQLInstanceID{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20},
+			afterNodes:   []base.SQLInstanceID{1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
+			threshold:    0.1,
+			expectReplan: true,
+		},
+		{
+			desc:         "mixed scenario: nodes added and removed",
+			beforeNodes:  []base.SQLInstanceID{1, 2, 3, 4, 5},
+			afterNodes:   []base.SQLInstanceID{1, 3, 5, 6, 7, 8},
+			threshold:    0.1,
+			expectReplan: true,
+		},
+	}
+
+	for _, testCase := range testCases {
+		t.Run(testCase.desc, func(t *testing.T) {
+			// Stability window of 1: a single over-threshold decision replans immediately (current behavior).
+			consecutiveReplanDecisions := &atomic.Int64{}
+			decider := replanDecider(consecutiveReplanDecisions, func() int64 { return 1 }, func() float64 { return testCase.threshold })
+			ctx := context.Background()
+			oldPlan := &sql.PhysicalPlan{}
+			oldPlan.PhysicalInfrastructure = &physicalplan.PhysicalInfrastructure{Processors: nil}
+			for _, nodeID := range testCase.beforeNodes {
+				oldPlan.Processors = append(oldPlan.Processors, physicalplan.Processor{SQLInstanceID: nodeID})
+			}
+			newPlan := &sql.PhysicalPlan{}
+			newPlan.PhysicalInfrastructure = &physicalplan.PhysicalInfrastructure{Processors: nil}
+			for _, nodeID := range testCase.afterNodes {
+				newPlan.Processors = append(newPlan.Processors, physicalplan.Processor{SQLInstanceID: nodeID})
+			}
+			replan := decider(ctx, oldPlan, newPlan)
+			require.Equal(t, testCase.expectReplan, replan)
+		})
+	}
+}
+
+func TestReplanDeciderStabilityWindow(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	defer log.Scope(t).Close(t)
+
+	testCases := []struct {
+		desc            string
+		stabilityWindow int64
+		threshold       float64
+		planChanges     [][]base.SQLInstanceID // sequence of plan changes
+		expectedReplans []bool                 // expected replan decision for each change
+	}{
+		{
+			desc:            "stability window 1 - immediate replan",
+			stabilityWindow: 1,
+			threshold:       0.1,
+			planChanges:     [][]base.SQLInstanceID{{2, 3}, {2, 4}, {3, 4}},
+			expectedReplans: []bool{true, true, true},
+		},
+		{
+			desc:            "stability window 2 - requires consecutive decisions",
+			stabilityWindow: 2,
+			threshold:       0.1,
+			planChanges:     [][]base.SQLInstanceID{{2, 3}, {2, 4}, {1, 2, 3}},
+			expectedReplans: []bool{false, true, false}, // second change meets the window; the counter resets after the replan
+		},
+		{
+			desc:            "stability window 2 - interrupted sequence",
+			stabilityWindow: 2,
+			threshold:       0.1,
+			planChanges:     [][]base.SQLInstanceID{{2, 3}, {1, 2, 3}, {2, 4}, {3, 4}},
+			expectedReplans: []bool{false, false, false, true}, // interrupted, then two consecutive
+		},
+		{
+			desc:            "stability window 3 - three consecutive needed",
+			stabilityWindow: 3,
+			threshold:       0.1,
+			planChanges:     [][]base.SQLInstanceID{{2, 3}, {2, 4}, {3, 4}, {1, 2, 3}},
+			expectedReplans: []bool{false, false, true, false}, // the third change triggers the replan
+		},
+	}
+
+	for _, testCase := range testCases {
+		t.Run(testCase.desc, func(t *testing.T) {
+			consecutiveReplanDecisions := &atomic.Int64{}
+			decider := replanDecider(
+				consecutiveReplanDecisions,
+				func() int64 { return testCase.stabilityWindow },
+				func() float64 { return testCase.threshold },
+			)
+			ctx := context.Background()
+
+			// Start from an initial plan with nodes 1, 2, 3.
+			initialPlan := &sql.PhysicalPlan{}
+			initialPlan.PhysicalInfrastructure = &physicalplan.PhysicalInfrastructure{Processors: nil}
+			for _, nodeID := range []base.SQLInstanceID{1, 2, 3} {
+				initialPlan.Processors = append(initialPlan.Processors, physicalplan.Processor{SQLInstanceID: nodeID})
+			}
+
+			for i, nodes := range testCase.planChanges {
+				newPlan := &sql.PhysicalPlan{}
+				newPlan.PhysicalInfrastructure = &physicalplan.PhysicalInfrastructure{Processors: nil}
+				for _, nodeID := range nodes {
+					newPlan.Processors = append(newPlan.Processors, physicalplan.Processor{SQLInstanceID: nodeID})
+				}
+
+				replan := decider(ctx, initialPlan, newPlan)
+				if replan != testCase.expectedReplans[i] {
+					t.Errorf("step %d: expected replan=%v, got %v (consecutive count: %d)", i, testCase.expectedReplans[i], replan, consecutiveReplanDecisions.Load())
+				}
+
+				// Carry the new plan forward so the next step compares against it.
+				initialPlan = newPlan
+			}
+		})
+	}
+}
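
The hunk above adds only the tests; replanDecider itself lives elsewhere in the package. For orientation, here is a minimal sketch of the decision logic these cases pin down. The name sketchReplanDecider and its []base.SQLInstanceID signature are assumptions for illustration (the real decider operates on *sql.PhysicalPlan values); it reuses the file's sync/atomic and pkg/base imports.

// sketchReplanDecider is a hypothetical reading aid, not the ttljob
// implementation. It returns a closure that decides whether a changed
// plan warrants replanning.
func sketchReplanDecider(
	consecutive *atomic.Int64,
	stabilityWindow func() int64,
	threshold func() float64,
) func(before, after []base.SQLInstanceID) bool {
	return func(before, after []base.SQLInstanceID) bool {
		frac := threshold()
		if frac <= 0 || len(before) == 0 {
			// A threshold of 0 disables replanning entirely
			// ("threshold disabled" case above).
			return false
		}
		current := make(map[base.SQLInstanceID]struct{}, len(after))
		for _, id := range after {
			current[id] = struct{}{}
		}
		// Count only nodes that disappeared from the plan; newly added
		// nodes never trigger a replan ("one node is brought online").
		missing := 0
		for _, id := range before {
			if _, ok := current[id]; !ok {
				missing++
			}
		}
		if float64(missing)/float64(len(before)) <= frac {
			consecutive.Store(0) // a calm observation resets the streak
			return false
		}
		// Replan only after stabilityWindow consecutive over-threshold
		// decisions; the counter resets once a replan fires.
		if consecutive.Add(1) >= stabilityWindow() {
			consecutive.Store(0)
			return true
		}
		return false
	}
}

Two details the boundary cases force: the lost fraction must strictly exceed the threshold (losing 1 of 10 nodes at 0.1 does not replan, while 1 of 9 does), and a threshold of 0 means "never replan" rather than "always replan".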