Skip to content

Commit 4722465

Browse files
committed
Adds check for exceptions when prevState is terminal
1 parent f1e9320 commit 4722465

File tree

2 files changed

+79
-4
lines changed

2 files changed

+79
-4
lines changed

flink-kubernetes-operator/src/main/java/org/apache/flink/kubernetes/operator/observer/JobStatusObserver.java

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,6 @@
4141
import java.util.HashMap;
4242
import java.util.Map;
4343
import java.util.concurrent.TimeoutException;
44-
import java.util.regex.Pattern;
4544

4645
import static org.apache.flink.kubernetes.operator.utils.FlinkResourceExceptionUtils.updateFlinkResourceException;
4746

@@ -51,8 +50,6 @@ public class JobStatusObserver<R extends AbstractFlinkResource<?, ?>> {
5150
private static final Logger LOG = LoggerFactory.getLogger(JobStatusObserver.class);
5251

5352
public static final String JOB_NOT_FOUND_ERR = "Job Not Found";
54-
private static final Pattern VALID_K8S_ANNOTATION_KEY_PATTERN =
55-
Pattern.compile("^[a-zA-Z0-9./-]{1,63}$");
5653

5754
protected final EventRecorder eventRecorder;
5855

@@ -90,7 +87,6 @@ public boolean observe(FlinkResourceContext<R> ctx) {
9087
updateJobStatus(ctx, newJobStatus);
9188
ReconciliationUtils.checkAndUpdateStableSpec(resource.getStatus());
9289
// see if the JM server is up, try to get the exceptions
93-
// in case the new
9490
if (!previousJobStatus.isGloballyTerminalState()) {
9591
observeJobManagerExceptions(ctx);
9692
}

flink-kubernetes-operator/src/test/java/org/apache/flink/kubernetes/operator/observer/JobStatusObserverTest.java

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,85 @@ void testFailed() throws Exception {
157157
assertTrue(flinkResourceEventCollector.events.isEmpty());
158158
}
159159

160+
@Test
161+
public void testExceptionObservedEvenWhenNewStateIsTerminal() throws Exception {
162+
var deployment = initDeployment();
163+
var status = deployment.getStatus();
164+
var jobStatus = status.getJobStatus();
165+
jobStatus.setState(JobStatus.RUNNING);
166+
Map<String, String> configuration = new HashMap<>();
167+
configuration.put(
168+
KubernetesOperatorConfigOptions.OPERATOR_EVENT_EXCEPTION_LIMIT.key(), "2");
169+
Configuration operatorConfig = Configuration.fromMap(configuration);
170+
FlinkResourceContext<AbstractFlinkResource<?, ?>> ctx =
171+
getResourceContext(deployment, operatorConfig);
172+
173+
var jobId = JobID.fromHexString(deployment.getStatus().getJobStatus().getJobId());
174+
ctx.getExceptionCacheEntry().setJobId(jobId.toHexString());
175+
ctx.getExceptionCacheEntry().setLastTimestamp(500L);
176+
flinkService.addExceptionHistory(jobId, "ExceptionOne", "trace1", 1000L);
177+
178+
// Ensure jobFailedErr is null before the observe call
179+
flinkService.submitApplicationCluster(
180+
deployment.getSpec().getJob(), ctx.getDeployConfig(deployment.getSpec()), false);
181+
flinkService.cancelJob(JobID.fromHexString(jobStatus.getJobId()), false);
182+
flinkService.setJobFailedErr(null);
183+
184+
observer.observe(ctx);
185+
186+
var events =
187+
kubernetesClient
188+
.v1()
189+
.events()
190+
.inNamespace(deployment.getMetadata().getNamespace())
191+
.list()
192+
.getItems();
193+
assertEquals(2, events.size()); // one will be for job status changed
194+
// assert that none of the events contain JOB_NOT_FOUND_ERR
195+
assertFalse(
196+
events.stream()
197+
.anyMatch(
198+
event ->
199+
event.getMessage()
200+
.contains(JobStatusObserver.JOB_NOT_FOUND_ERR)));
201+
}
202+
203+
@Test
204+
public void testExceptionNotObservedWhenOldStateIsTerminal() throws Exception {
205+
var deployment = initDeployment();
206+
var status = deployment.getStatus();
207+
var jobStatus = status.getJobStatus();
208+
jobStatus.setState(JobStatus.CANCELED);
209+
Map<String, String> configuration = new HashMap<>();
210+
configuration.put(
211+
KubernetesOperatorConfigOptions.OPERATOR_EVENT_EXCEPTION_LIMIT.key(), "2");
212+
Configuration operatorConfig = Configuration.fromMap(configuration);
213+
FlinkResourceContext<AbstractFlinkResource<?, ?>> ctx =
214+
getResourceContext(deployment, operatorConfig);
215+
216+
var jobId = JobID.fromHexString(deployment.getStatus().getJobStatus().getJobId());
217+
ctx.getExceptionCacheEntry().setJobId(jobId.toHexString());
218+
ctx.getExceptionCacheEntry().setLastTimestamp(500L);
219+
flinkService.addExceptionHistory(jobId, "ExceptionOne", "trace1", 1000L);
220+
221+
// Ensure jobFailedErr is null before the observe call
222+
flinkService.submitApplicationCluster(
223+
deployment.getSpec().getJob(), ctx.getDeployConfig(deployment.getSpec()), false);
224+
flinkService.setJobFailedErr(null);
225+
226+
observer.observe(ctx);
227+
228+
var events =
229+
kubernetesClient
230+
.v1()
231+
.events()
232+
.inNamespace(deployment.getMetadata().getNamespace())
233+
.list()
234+
.getItems();
235+
assertEquals(1, events.size()); // only one event for job status changed
236+
assertEquals(EventRecorder.Reason.JobStatusChanged.name(), events.get(0).getReason());
237+
}
238+
160239
@Test
161240
public void testExceptionLimitConfig() throws Exception {
162241
var observer = new JobStatusObserver<>(eventRecorder);

0 commit comments

Comments
 (0)