@@ -150,32 +150,12 @@ public void verifyBasicTransition(
150150 private TestingFlinkBlueGreenDeploymentController .BlueGreenReconciliationResult handleSavepoint (
151151 TestingFlinkBlueGreenDeploymentController .BlueGreenReconciliationResult rs )
152152 throws Exception {
153- return handleSavepointWithFailure (rs , null );
154- }
155-
156- @ NotNull
157- private TestingFlinkBlueGreenDeploymentController .BlueGreenReconciliationResult
158- handleSavepointWithFailure (
159- TestingFlinkBlueGreenDeploymentController .BlueGreenReconciliationResult rs ,
160- Exception expectedException )
161- throws Exception {
162-
163- if (expectedException != null ) {
164- flinkService .setSavepointTriggerException (expectedException );
165- }
166153
167154 var triggers = flinkService .getSavepointTriggers ();
168155 triggers .clear ();
169156
170157 rs = reconcile (rs .deployment );
171158
172- if (expectedException != null ) {
173- // Should fail immediately without entering savepointing state
174- assertFailingJobStatus (rs );
175- return rs ;
176- }
177-
178- // Continue with existing successful savepoint logic...
179159 // Simulating a pending savepoint
180160 triggers .put (rs .deployment .getStatus ().getSavepointTriggerId (), false );
181161
@@ -189,6 +169,9 @@ private TestingFlinkBlueGreenDeploymentController.BlueGreenReconciliationResult
189169 // This next reconciliation should continue waiting on the pending savepoint
190170 rs = reconcile (rs .deployment );
191171
172+ // NOTE: internally the above reconcile call invokes the fetchSavepointInfo on the trigger,
173+ // the TestFlinkService automatically sets it to "true" (completed)
174+
192175 assertTrue (rs .updateControl .isNoUpdate ());
193176 assertTrue (rs .updateControl .getScheduleDelay ().isPresent ());
194177
@@ -525,8 +508,118 @@ public void verifySavepointFailureWithDifferentExceptionTypes(
525508 assertEquals (1 , flinkDeployments .size ());
526509 }
527510
511+ @ ParameterizedTest
512+ @ MethodSource ("org.apache.flink.kubernetes.operator.TestUtils#flinkVersions" )
513+ public void verifySavepointFetchFailureRecovery (FlinkVersion flinkVersion ) throws Exception {
514+ var blueGreenDeployment =
515+ buildSessionCluster (
516+ TEST_DEPLOYMENT_NAME ,
517+ TEST_NAMESPACE ,
518+ flinkVersion ,
519+ null ,
520+ UpgradeMode .SAVEPOINT );
521+
522+ var rs = executeBasicDeployment (flinkVersion , blueGreenDeployment , false , null );
523+
524+ String customValue = UUID .randomUUID ().toString ();
525+ simulateChangeInSpec (rs .deployment , customValue , 0 , null );
526+
527+ // Trigger savepoint successfully and go through savepointing flow
528+ rs = handleSavepoint (rs );
529+
530+ // Now configure service to return fetch error - this will be detected in
531+ // configureInitialSavepoint
532+ flinkService .setSavepointFetchError ("Savepoint corrupted or not found" );
533+
534+ // The next reconciliation should transition back to ACTIVE_BLUE and then try
535+ // startTransition
536+ // which will fail in configureInitialSavepoint due to fetch error
537+ rs = reconcile (rs .deployment );
538+ assertFailingJobStatus (rs );
539+ assertTrue (rs .reconciledStatus .getError ().contains ("Could not start Transition" ));
540+ assertTrue (rs .reconciledStatus .getError ().contains ("Savepoint corrupted or not found" ));
541+
542+ // Recovery: Clear the fetch error and try again with new spec change
543+ flinkService .clearSavepointFetchError ();
544+ customValue = UUID .randomUUID ().toString () + "_recovery" ;
545+ simulateChangeInSpec (rs .deployment , customValue , ALT_DELETION_DELAY_VALUE , null );
546+
547+ // Should now succeed and complete transition properly
548+ rs = handleSavepoint (rs );
549+
550+ // Continue with successful transition - second savepoint will be "savepoint_2"
551+ testTransitionToGreen (rs , customValue , "savepoint_2" );
552+ }
553+
554+ @ ParameterizedTest
555+ @ MethodSource ("savepointFetchErrorProvider" )
556+ public void verifySavepointFetchFailureWithDifferentErrors (
557+ FlinkVersion flinkVersion , String fetchError , String expectedErrorFragment )
558+ throws Exception {
559+
560+ var blueGreenDeployment =
561+ buildSessionCluster (
562+ TEST_DEPLOYMENT_NAME ,
563+ TEST_NAMESPACE ,
564+ flinkVersion ,
565+ null ,
566+ UpgradeMode .SAVEPOINT );
567+ var rs = executeBasicDeployment (flinkVersion , blueGreenDeployment , false , null );
568+
569+ simulateChangeInSpec (rs .deployment , UUID .randomUUID ().toString (), 0 , null );
570+
571+ // Trigger savepoint successfully and go through savepointing flow
572+ rs = handleSavepoint (rs );
573+
574+ // Configure service to return fetch error - this will be detected in
575+ // configureInitialSavepoint
576+ flinkService .setSavepointFetchError (fetchError );
577+
578+ // The next reconciliation should transition back to ACTIVE_BLUE and then try
579+ // startTransition
580+ // which will fail in configureInitialSavepoint due to fetch error
581+ rs = reconcile (rs .deployment );
582+
583+ assertFailingJobStatus (rs );
584+ assertTrue (rs .reconciledStatus .getError ().contains ("Could not start Transition" ));
585+ assertTrue (rs .reconciledStatus .getError ().contains (expectedErrorFragment ));
586+
587+ // Should remain in ACTIVE_BLUE state after failure
588+ assertEquals (
589+ FlinkBlueGreenDeploymentState .ACTIVE_BLUE , rs .reconciledStatus .getBlueGreenState ());
590+
591+ // Only Blue deployment should exist (Green transition never started)
592+ var flinkDeployments = getFlinkDeployments ();
593+ assertEquals (1 , flinkDeployments .size ());
594+ }
595+
528596 // ==================== Parameterized Test Inputs ====================
529597
598+ static Stream <Arguments > savepointFetchErrorProvider () {
599+ return TestUtils .flinkVersions ()
600+ .flatMap (
601+ flinkVersionArgs -> {
602+ FlinkVersion version = (FlinkVersion ) flinkVersionArgs .get ()[0 ];
603+ return Stream .of (
604+ Arguments .of (
605+ version ,
606+ "Savepoint file corrupted" ,
607+ "Savepoint file corrupted" ),
608+ Arguments .of (
609+ version ,
610+ "Storage system unavailable" ,
611+ "Storage system unavailable" ),
612+ Arguments .of (
613+ version ,
614+ "Access denied to savepoint location" ,
615+ "Access denied to savepoint location" ),
616+ Arguments .of (
617+ version ,
618+ "Savepoint metadata missing" ,
619+ "Savepoint metadata missing" ));
620+ });
621+ }
622+
530623 static Stream <Arguments > savepointExceptionProvider () {
531624 return TestUtils .flinkVersions ()
532625 .flatMap (
0 commit comments