24
24
import org .apache .flink .kubernetes .operator .api .bluegreen .DeploymentType ;
25
25
import org .apache .flink .kubernetes .operator .api .lifecycle .ResourceLifecycleState ;
26
26
import org .apache .flink .kubernetes .operator .api .spec .FlinkBlueGreenDeploymentSpec ;
27
+ import org .apache .flink .kubernetes .operator .api .spec .JobState ;
27
28
import org .apache .flink .kubernetes .operator .api .status .FlinkBlueGreenDeploymentState ;
28
29
import org .apache .flink .kubernetes .operator .api .status .FlinkBlueGreenDeploymentStatus ;
29
30
import org .apache .flink .kubernetes .operator .api .status .Savepoint ;
61
62
import java .util .stream .Collectors ;
62
63
import java .util .stream .Stream ;
63
64
64
- import static io .javaoperatorsdk .operator .api .reconciler .UpdateControl .noUpdate ;
65
-
66
65
/** Controller that runs the main reconcile loop for Flink Blue/Green deployments. */
67
66
@ ControllerConfiguration
68
67
public class FlinkBlueGreenDeploymentController
69
68
implements Reconciler <FlinkBlueGreenDeployment >,
70
69
EventSourceInitializer <FlinkBlueGreenDeployment > {
71
70
72
71
private static final Logger LOG = LoggerFactory .getLogger (FlinkDeploymentController .class );
73
- private static final int DEFAULT_MAX_NUM_RETRIES = 10 ;
74
- private static final int DEFAULT_RECONCILIATION_RESCHEDULING_INTERVAL_MS = 15000 ;
72
+ private static final int DEFAULT_RECONCILIATION_RESCHEDULING_INTERVAL_MS = 15000 ; // 15 secs
75
73
76
74
private final FlinkResourceContextFactory ctxFactory ;
77
75
76
+ public static int minimumAbortGracePeriodMs = 120000 ; // 2 mins
77
+
78
78
public FlinkBlueGreenDeploymentController (FlinkResourceContextFactory ctxFactory ) {
79
79
this .ctxFactory = ctxFactory ;
80
80
}
@@ -95,17 +95,16 @@ public Map<String, EventSource> prepareEventSources(
95
95
96
96
@ Override
97
97
public UpdateControl <FlinkBlueGreenDeployment > reconcile (
98
- FlinkBlueGreenDeployment flinkBlueGreenDeployment ,
99
- Context <FlinkBlueGreenDeployment > josdkContext )
98
+ FlinkBlueGreenDeployment bgDeployment , Context <FlinkBlueGreenDeployment > josdkContext )
100
99
throws Exception {
101
100
102
- FlinkBlueGreenDeploymentStatus deploymentStatus = flinkBlueGreenDeployment .getStatus ();
101
+ FlinkBlueGreenDeploymentStatus deploymentStatus = bgDeployment .getStatus ();
103
102
104
103
if (deploymentStatus == null ) {
105
104
deploymentStatus = new FlinkBlueGreenDeploymentStatus ();
106
- setLastReconciledSpec (flinkBlueGreenDeployment , deploymentStatus );
105
+ setLastReconciledSpec (bgDeployment , deploymentStatus );
107
106
return initiateDeployment (
108
- flinkBlueGreenDeployment ,
107
+ bgDeployment ,
109
108
deploymentStatus ,
110
109
DeploymentType .BLUE ,
111
110
FlinkBlueGreenDeploymentState .TRANSITIONING_TO_BLUE ,
@@ -119,38 +118,46 @@ public UpdateControl<FlinkBlueGreenDeployment> reconcile(
119
118
switch (deploymentStatus .getBlueGreenState ()) {
120
119
case ACTIVE_BLUE :
121
120
return checkAndInitiateDeployment (
122
- flinkBlueGreenDeployment ,
121
+ bgDeployment ,
123
122
deployments ,
124
123
deploymentStatus ,
125
124
DeploymentType .BLUE ,
126
125
josdkContext );
127
126
case ACTIVE_GREEN :
128
127
return checkAndInitiateDeployment (
129
- flinkBlueGreenDeployment ,
128
+ bgDeployment ,
130
129
deployments ,
131
130
deploymentStatus ,
132
131
DeploymentType .GREEN ,
133
132
josdkContext );
134
133
case TRANSITIONING_TO_BLUE :
135
134
return monitorTransition (
136
- flinkBlueGreenDeployment ,
135
+ bgDeployment ,
137
136
deployments ,
138
137
deploymentStatus ,
139
138
DeploymentType .GREEN ,
140
139
josdkContext );
141
140
case TRANSITIONING_TO_GREEN :
142
141
return monitorTransition (
143
- flinkBlueGreenDeployment ,
142
+ bgDeployment ,
144
143
deployments ,
145
144
deploymentStatus ,
146
145
DeploymentType .BLUE ,
147
146
josdkContext );
148
147
default :
149
- return noUpdate ();
148
+ return UpdateControl . noUpdate ();
150
149
}
151
150
}
152
151
}
153
152
153
+ private static void setAbortTimestamp (
154
+ FlinkBlueGreenDeployment bgDeployment ,
155
+ FlinkBlueGreenDeploymentStatus deploymentStatus ) {
156
+ int abortGracePeriod = bgDeployment .getSpec ().getTemplate ().getAbortGracePeriodMs ();
157
+ abortGracePeriod = Math .max (abortGracePeriod , minimumAbortGracePeriodMs );
158
+ deploymentStatus .setAbortTimestamp (System .currentTimeMillis () + abortGracePeriod );
159
+ }
160
+
154
161
private UpdateControl <FlinkBlueGreenDeployment > monitorTransition (
155
162
FlinkBlueGreenDeployment bgDeployment ,
156
163
FlinkBlueGreenDeployments deployments ,
@@ -159,7 +166,7 @@ private UpdateControl<FlinkBlueGreenDeployment> monitorTransition(
159
166
Context <FlinkBlueGreenDeployment > josdkContext )
160
167
throws JsonProcessingException {
161
168
162
- if (hasSpecChanged (bgDeployment .getSpec (), deploymentStatus , currentDeploymentType )) {
169
+ if (hasSpecChanged (bgDeployment .getSpec (), deploymentStatus )) {
163
170
// this means the spec was changed during transition,
164
171
// ignore the new change, revert the spec and log as warning
165
172
bgDeployment .setSpec (
@@ -196,7 +203,7 @@ private UpdateControl<FlinkBlueGreenDeployment> monitorTransition(
196
203
return canDelete (
197
204
bgDeployment , deploymentStatus , josdkContext , currentDeployment , nextState );
198
205
} else {
199
- return retryOrAbort (
206
+ return shouldAbort (
200
207
bgDeployment , deploymentStatus , josdkContext , nextDeployment , nextState );
201
208
}
202
209
}
@@ -234,41 +241,56 @@ private UpdateControl<FlinkBlueGreenDeployment> canDelete(
234
241
}
235
242
}
236
243
237
- private UpdateControl <FlinkBlueGreenDeployment > retryOrAbort (
244
+ private UpdateControl <FlinkBlueGreenDeployment > shouldAbort (
238
245
FlinkBlueGreenDeployment bgDeployment ,
239
246
FlinkBlueGreenDeploymentStatus deploymentStatus ,
240
247
Context <FlinkBlueGreenDeployment > josdkContext ,
241
248
FlinkDeployment nextDeployment ,
242
249
FlinkBlueGreenDeploymentState nextState ) {
243
- int maxNumRetries = bgDeployment .getSpec ().getTemplate ().getMaxNumRetries ();
244
- if (maxNumRetries <= 0 ) {
245
- maxNumRetries = DEFAULT_MAX_NUM_RETRIES ;
250
+
251
+ String deploymentName = nextDeployment .getMetadata ().getName ();
252
+ long abortTimestamp = deploymentStatus .getAbortTimestamp ();
253
+
254
+ if (abortTimestamp == 0 ) {
255
+ throw new IllegalStateException ("Unexpected abortTimestamp == 0" );
246
256
}
247
257
248
- if (deploymentStatus . getNumRetries () >= maxNumRetries ) {
258
+ if (abortTimestamp < System . currentTimeMillis () ) {
249
259
// ABORT
250
260
// Suspend the nextDeployment (FlinkDeployment)
251
- nextDeployment .getStatus ().getJobStatus ().setState (JobStatus .SUSPENDED );
252
- josdkContext .getClient ().resource (nextDeployment ).replace ();
261
+ nextDeployment .getSpec ().getJob ().setState (JobState .SUSPENDED );
262
+ josdkContext .getClient ().resource (nextDeployment ).update ();
253
263
254
264
// We indicate this Blue/Green deployment is no longer Transitioning
255
265
// and rollback the state value
256
- deploymentStatus . setBlueGreenState (
266
+ var previousState =
257
267
nextState == FlinkBlueGreenDeploymentState .ACTIVE_BLUE
258
268
? FlinkBlueGreenDeploymentState .ACTIVE_GREEN
259
- : FlinkBlueGreenDeploymentState .ACTIVE_BLUE );
269
+ : FlinkBlueGreenDeploymentState .ACTIVE_BLUE ;
270
+
271
+ deploymentStatus .setBlueGreenState (previousState );
272
+
273
+ LOG .warn (
274
+ "Aborting deployment '"
275
+ + deploymentName
276
+ + "', rolling B/G deployment back to "
277
+ + previousState );
260
278
261
279
// If the current running FlinkDeployment is not in RUNNING/STABLE,
262
280
// we flag this Blue/Green as FAILING
263
281
return patchStatusUpdateControl (
264
282
bgDeployment , deploymentStatus , null , JobStatus .FAILING );
265
283
} else {
266
284
// RETRY
267
- deploymentStatus .setNumRetries (deploymentStatus .getNumRetries () + 1 );
268
-
269
- LOG .info ("Deployment " + nextDeployment .getMetadata ().getName () + " not ready yet" );
285
+ var delay = abortTimestamp - System .currentTimeMillis ();
286
+ LOG .info (
287
+ "Deployment '"
288
+ + deploymentName
289
+ + "' not ready yet, retrying in "
290
+ + delay
291
+ + " ms" );
270
292
return patchStatusUpdateControl (bgDeployment , deploymentStatus , null , null )
271
- .rescheduleAfter (getReconciliationReschedInterval ( bgDeployment ) );
293
+ .rescheduleAfter (delay );
272
294
}
273
295
}
274
296
@@ -290,34 +312,39 @@ private UpdateControl<FlinkBlueGreenDeployment> deleteAndFinalize(
290
312
291
313
if (currentDeployment != null ) {
292
314
deleteDeployment (currentDeployment , josdkContext );
293
- return noUpdate ();
315
+ return UpdateControl .< FlinkBlueGreenDeployment > noUpdate (). rescheduleAfter ( 500 );
294
316
} else {
317
+ LOG .info (
318
+ "Finalizing deployment '"
319
+ + bgDeployment .getMetadata ().getName ()
320
+ + "' to "
321
+ + nextState
322
+ + " state" );
295
323
deploymentStatus .setDeploymentReadyTimestamp (0 );
324
+ deploymentStatus .setAbortTimestamp (0 );
296
325
return patchStatusUpdateControl (
297
326
bgDeployment , deploymentStatus , nextState , JobStatus .RUNNING );
298
327
}
299
328
}
300
329
301
330
private UpdateControl <FlinkBlueGreenDeployment > checkAndInitiateDeployment (
302
- FlinkBlueGreenDeployment flinkBlueGreenDeployment ,
331
+ FlinkBlueGreenDeployment bgDeployment ,
303
332
FlinkBlueGreenDeployments deployments ,
304
333
FlinkBlueGreenDeploymentStatus deploymentStatus ,
305
334
DeploymentType currentDeploymentType ,
306
335
Context <FlinkBlueGreenDeployment > josdkContext )
307
336
throws Exception {
308
337
309
- if (hasSpecChanged (
310
- flinkBlueGreenDeployment .getSpec (), deploymentStatus , currentDeploymentType )) {
338
+ if (hasSpecChanged (bgDeployment .getSpec (), deploymentStatus )) {
311
339
312
340
// Ack the change in the spec (setLastReconciledSpec)
313
- setLastReconciledSpec (flinkBlueGreenDeployment , deploymentStatus );
341
+ setLastReconciledSpec (bgDeployment , deploymentStatus );
314
342
315
343
FlinkDeployment currentFlinkDeployment =
316
344
DeploymentType .BLUE == currentDeploymentType
317
345
? deployments .getFlinkDeploymentBlue ()
318
346
: deployments .getFlinkDeploymentGreen ();
319
347
320
- // spec, report the error and abort
321
348
if (isDeploymentReady (currentFlinkDeployment , josdkContext , deploymentStatus )) {
322
349
323
350
DeploymentType nextDeploymentType = DeploymentType .BLUE ;
@@ -335,7 +362,7 @@ private UpdateControl<FlinkBlueGreenDeployment> checkAndInitiateDeployment(
335
362
Savepoint lastCheckpoint = configureSavepoint (resourceContext );
336
363
337
364
return initiateDeployment (
338
- flinkBlueGreenDeployment ,
365
+ bgDeployment ,
339
366
deploymentStatus ,
340
367
nextDeploymentType ,
341
368
nextState ,
@@ -347,19 +374,19 @@ private UpdateControl<FlinkBlueGreenDeployment> checkAndInitiateDeployment(
347
374
// we flag this Blue/Green as FAILING
348
375
if (deploymentStatus .getJobStatus ().getState () != JobStatus .FAILING ) {
349
376
return patchStatusUpdateControl (
350
- flinkBlueGreenDeployment , deploymentStatus , null , JobStatus .FAILING );
377
+ bgDeployment , deploymentStatus , null , JobStatus .FAILING );
351
378
}
352
379
}
353
380
}
354
381
355
- return noUpdate ();
382
+ return UpdateControl . noUpdate ();
356
383
}
357
384
358
385
private static void setLastReconciledSpec (
359
- FlinkBlueGreenDeployment flinkBlueGreenDeployment ,
386
+ FlinkBlueGreenDeployment bgDeployment ,
360
387
FlinkBlueGreenDeploymentStatus deploymentStatus ) {
361
388
deploymentStatus .setLastReconciledSpec (
362
- SpecUtils .serializeObject (flinkBlueGreenDeployment .getSpec (), "spec" ));
389
+ SpecUtils .serializeObject (bgDeployment .getSpec (), "spec" ));
363
390
deploymentStatus .setLastReconciledTimestamp (System .currentTimeMillis ());
364
391
}
365
392
@@ -444,7 +471,7 @@ private static Savepoint configureSavepoint(
444
471
}
445
472
446
473
private UpdateControl <FlinkBlueGreenDeployment > initiateDeployment (
447
- FlinkBlueGreenDeployment flinkBlueGreenDeployment ,
474
+ FlinkBlueGreenDeployment bgDeployment ,
448
475
FlinkBlueGreenDeploymentStatus deploymentStatus ,
449
476
DeploymentType nextDeploymentType ,
450
477
FlinkBlueGreenDeploymentState nextState ,
@@ -453,15 +480,12 @@ private UpdateControl<FlinkBlueGreenDeployment> initiateDeployment(
453
480
boolean isFirstDeployment )
454
481
throws JsonProcessingException {
455
482
456
- deploy (
457
- flinkBlueGreenDeployment ,
458
- nextDeploymentType ,
459
- lastCheckpoint ,
460
- josdkContext ,
461
- isFirstDeployment );
483
+ deploy (bgDeployment , nextDeploymentType , lastCheckpoint , josdkContext , isFirstDeployment );
462
484
463
- return patchStatusUpdateControl (flinkBlueGreenDeployment , deploymentStatus , nextState , null )
464
- .rescheduleAfter (getReconciliationReschedInterval (flinkBlueGreenDeployment ));
485
+ setAbortTimestamp (bgDeployment , deploymentStatus );
486
+
487
+ return patchStatusUpdateControl (bgDeployment , deploymentStatus , nextState , null )
488
+ .rescheduleAfter (getReconciliationReschedInterval (bgDeployment ));
465
489
}
466
490
467
491
private boolean isDeploymentReady (
@@ -508,9 +532,7 @@ private static Stream<PodResource> getDeploymentPods(
508
532
}
509
533
510
534
private boolean hasSpecChanged (
511
- FlinkBlueGreenDeploymentSpec newSpec ,
512
- FlinkBlueGreenDeploymentStatus deploymentStatus ,
513
- DeploymentType deploymentType ) {
535
+ FlinkBlueGreenDeploymentSpec newSpec , FlinkBlueGreenDeploymentStatus deploymentStatus ) {
514
536
515
537
String lastReconciledSpec = deploymentStatus .getLastReconciledSpec ();
516
538
String newSpecSerialized = SpecUtils .serializeObject (newSpec , "spec" );
@@ -582,21 +604,23 @@ private void deploy(
582
604
583
605
private static void deleteDeployment (
584
606
FlinkDeployment currentDeployment , Context <FlinkBlueGreenDeployment > josdkContext ) {
607
+ String deploymentName = currentDeployment .getMetadata ().getName ();
585
608
List <StatusDetails > deletedStatus =
586
609
josdkContext
587
610
.getClient ()
588
611
.resources (FlinkDeployment .class )
589
612
.inNamespace (currentDeployment .getMetadata ().getNamespace ())
590
- .withName (currentDeployment . getMetadata (). getName () )
613
+ .withName (deploymentName )
591
614
.delete ();
592
615
593
616
boolean deleted =
594
617
deletedStatus .size () == 1
595
618
&& deletedStatus .get (0 ).getKind ().equals ("FlinkDeployment" );
619
+
596
620
if (!deleted ) {
597
- LOG .info ("Deployment not deleted, will retry" );
621
+ LOG .info ("Deployment '" + deploymentName + "' not deleted, will retry" );
598
622
} else {
599
- LOG .info ("Deployment deleted!" );
623
+ LOG .info ("Deployment '" + deploymentName + "' deleted!" );
600
624
}
601
625
}
602
626
0 commit comments