@@ -418,6 +418,313 @@ public void testExceptionEventTriggerInitialization() throws Exception {
418
418
ctx .getExceptionCacheEntry ().getLastTimestamp ());
419
419
}
420
420
421
+ @ Test
422
+ public void testSessionJobExceptionObservedEvenWhenNewStateIsTerminal () throws Exception {
423
+ var sessionJob = initSessionJob ();
424
+ var status = sessionJob .getStatus ();
425
+ var jobStatus = status .getJobStatus ();
426
+ jobStatus .setState (JobStatus .RUNNING );
427
+ Map <String , String > configuration = new HashMap <>();
428
+ configuration .put (
429
+ KubernetesOperatorConfigOptions .OPERATOR_EVENT_EXCEPTION_LIMIT .key (), "2" );
430
+ Configuration operatorConfig = Configuration .fromMap (configuration );
431
+ FlinkResourceContext <AbstractFlinkResource <?, ?>> ctx =
432
+ getResourceContext (
433
+ sessionJob ,
434
+ TestUtils .createContextWithReadyFlinkDeployment (kubernetesClient ),
435
+ operatorConfig );
436
+
437
+ var jobId = JobID .fromHexString (sessionJob .getStatus ().getJobStatus ().getJobId ());
438
+ ctx .getExceptionCacheEntry ().setInitialized (true );
439
+ ctx .getExceptionCacheEntry ().setJobId (jobId .toHexString ());
440
+ ctx .getExceptionCacheEntry ().setLastTimestamp (Instant .ofEpochMilli (500L ));
441
+ flinkService .addExceptionHistory (jobId , "SessionJobExceptionOne" , "trace1" , 1000L );
442
+
443
+ // Submit the session job
444
+ flinkService .submitJobToSessionCluster (
445
+ sessionJob .getMetadata (),
446
+ sessionJob .getSpec (),
447
+ jobId ,
448
+ ctx .getDeployConfig (sessionJob .getSpec ()),
449
+ null );
450
+
451
+ // Cancel the job to make it terminal
452
+ flinkService .cancelJob (jobId , false );
453
+ flinkService .setJobFailedErr (null );
454
+
455
+ observer .observe (ctx );
456
+
457
+ var events =
458
+ kubernetesClient
459
+ .v1 ()
460
+ .events ()
461
+ .inNamespace (sessionJob .getMetadata ().getNamespace ())
462
+ .list ()
463
+ .getItems ();
464
+ assertEquals (2 , events .size ()); // one will be for job status changed
465
+ // assert that none of the events contain JOB_NOT_FOUND_ERR
466
+ assertFalse (
467
+ events .stream ()
468
+ .anyMatch (
469
+ event ->
470
+ event .getMessage ()
471
+ .contains (JobStatusObserver .JOB_NOT_FOUND_ERR )));
472
+ }
473
+
474
+ @ Test
475
+ public void testSessionJobExceptionNotObservedWhenOldStateIsTerminal () throws Exception {
476
+ var sessionJob = initSessionJob ();
477
+ var status = sessionJob .getStatus ();
478
+ var jobStatus = status .getJobStatus ();
479
+ jobStatus .setState (JobStatus .FINISHED ); // Set to terminal state
480
+
481
+ FlinkResourceContext <AbstractFlinkResource <?, ?>> ctx =
482
+ getResourceContext (
483
+ sessionJob ,
484
+ TestUtils .createContextWithReadyFlinkDeployment (kubernetesClient ));
485
+
486
+ var jobId = JobID .fromHexString (sessionJob .getStatus ().getJobStatus ().getJobId ());
487
+ flinkService .submitJobToSessionCluster (
488
+ sessionJob .getMetadata (),
489
+ sessionJob .getSpec (),
490
+ jobId ,
491
+ ctx .getDeployConfig (sessionJob .getSpec ()),
492
+ null );
493
+ ReconciliationUtils .updateStatusForDeployedSpec (sessionJob , new Configuration ());
494
+ ctx .getExceptionCacheEntry ().setInitialized (true );
495
+ ctx .getExceptionCacheEntry ().setJobId (jobId .toHexString ());
496
+ ctx .getExceptionCacheEntry ().setLastTimestamp (Instant .ofEpochMilli (3000L ));
497
+
498
+ long exceptionTime = 4000L ;
499
+ flinkService .addExceptionHistory (jobId , "SessionJobException" , "trace" , exceptionTime );
500
+
501
+ // Ensure jobFailedErr is null before the observe call
502
+ flinkService .setJobFailedErr (null );
503
+ observer .observe (ctx );
504
+
505
+ var events =
506
+ kubernetesClient
507
+ .v1 ()
508
+ .events ()
509
+ .inNamespace (sessionJob .getMetadata ().getNamespace ())
510
+ .list ()
511
+ .getItems ();
512
+ assertEquals (
513
+ 1 , events .size ()); // Only one event for job status changed, no exception events
514
+ assertEquals (EventRecorder .Reason .JobStatusChanged .name (), events .get (0 ).getReason ());
515
+ }
516
+
517
+ @ Test
518
+ public void testSessionJobExceptionLimitConfig () throws Exception {
519
+ var sessionJob = initSessionJob ();
520
+ var status = sessionJob .getStatus ();
521
+ var jobStatus = status .getJobStatus ();
522
+ jobStatus .setState (JobStatus .RUNNING );
523
+ Map <String , String > configuration = new HashMap <>();
524
+ configuration .put (
525
+ KubernetesOperatorConfigOptions .OPERATOR_EVENT_EXCEPTION_LIMIT .key (), "2" );
526
+ Configuration operatorConfig = Configuration .fromMap (configuration );
527
+ FlinkResourceContext <AbstractFlinkResource <?, ?>> ctx =
528
+ getResourceContext (
529
+ sessionJob ,
530
+ TestUtils .createContextWithReadyFlinkDeployment (kubernetesClient ),
531
+ operatorConfig );
532
+
533
+ var jobId = JobID .fromHexString (sessionJob .getStatus ().getJobStatus ().getJobId ());
534
+ flinkService .submitJobToSessionCluster (
535
+ sessionJob .getMetadata (),
536
+ sessionJob .getSpec (),
537
+ jobId ,
538
+ ctx .getDeployConfig (sessionJob .getSpec ()),
539
+ null );
540
+ ReconciliationUtils .updateStatusForDeployedSpec (sessionJob , new Configuration ());
541
+ ctx .getExceptionCacheEntry ().setInitialized (true );
542
+ ctx .getExceptionCacheEntry ().setJobId (jobId .toHexString ());
543
+ ctx .getExceptionCacheEntry ().setLastTimestamp (Instant .ofEpochMilli (3000L ));
544
+
545
+ // Add 3 exceptions, but only 2 should be reported due to limit
546
+ flinkService .addExceptionHistory (jobId , "SessionJobException1" , "trace1" , 4000L );
547
+ flinkService .addExceptionHistory (jobId , "SessionJobException2" , "trace2" , 5000L );
548
+ flinkService .addExceptionHistory (jobId , "SessionJobException3" , "trace3" , 6000L );
549
+
550
+ // Ensure jobFailedErr is null before the observe call
551
+ flinkService .setJobFailedErr (null );
552
+ observer .observe (ctx );
553
+
554
+ var events =
555
+ kubernetesClient
556
+ .v1 ()
557
+ .events ()
558
+ .inNamespace (sessionJob .getMetadata ().getNamespace ())
559
+ .list ()
560
+ .getItems ();
561
+ assertEquals (2 , events .size ()); // Only 2 exceptions should be reported
562
+ }
563
+
564
+ @ Test
565
+ public void testSessionJobStackTraceTruncationConfig () throws Exception {
566
+ var sessionJob = initSessionJob ();
567
+ var status = sessionJob .getStatus ();
568
+ var jobStatus = status .getJobStatus ();
569
+ jobStatus .setState (JobStatus .RUNNING );
570
+ Map <String , String > configuration = new HashMap <>();
571
+ configuration .put (
572
+ KubernetesOperatorConfigOptions .OPERATOR_EVENT_EXCEPTION_STACKTRACE_LINES .key (),
573
+ "2" );
574
+ Configuration operatorConfig = Configuration .fromMap (configuration );
575
+ FlinkResourceContext <AbstractFlinkResource <?, ?>> ctx =
576
+ getResourceContext (
577
+ sessionJob ,
578
+ TestUtils .createContextWithReadyFlinkDeployment (kubernetesClient ),
579
+ operatorConfig );
580
+
581
+ var jobId = JobID .fromHexString (sessionJob .getStatus ().getJobStatus ().getJobId ());
582
+ flinkService .submitJobToSessionCluster (
583
+ sessionJob .getMetadata (),
584
+ sessionJob .getSpec (),
585
+ jobId ,
586
+ ctx .getDeployConfig (sessionJob .getSpec ()),
587
+ null );
588
+ ReconciliationUtils .updateStatusForDeployedSpec (sessionJob , new Configuration ());
589
+ ctx .getExceptionCacheEntry ().setInitialized (true );
590
+ ctx .getExceptionCacheEntry ().setJobId (jobId .toHexString ());
591
+ ctx .getExceptionCacheEntry ().setLastTimestamp (Instant .ofEpochMilli (3000L ));
592
+
593
+ long exceptionTime = 4000L ;
594
+ String longTrace = "line1\n line2\n line3\n line4" ;
595
+ flinkService .addExceptionHistory (
596
+ jobId , "SessionJobStackTraceCheck" , longTrace , exceptionTime );
597
+
598
+ // Ensure jobFailedErr is null before the observe call
599
+ flinkService .setJobFailedErr (null );
600
+ observer .observe (ctx );
601
+
602
+ var events =
603
+ kubernetesClient
604
+ .v1 ()
605
+ .events ()
606
+ .inNamespace (sessionJob .getMetadata ().getNamespace ())
607
+ .list ()
608
+ .getItems ();
609
+ assertEquals (1 , events .size ());
610
+ String msg = events .get (0 ).getMessage ();
611
+ assertTrue (msg .contains ("line1" ));
612
+ assertTrue (msg .contains ("line2" ));
613
+ assertFalse (msg .contains ("line3" ));
614
+ assertTrue (msg .contains ("... (2 more lines)" ));
615
+ }
616
+
617
+ @ Test
618
+ public void testSessionJobIgnoreOldExceptions () throws Exception {
619
+ var sessionJob = initSessionJob ();
620
+ var status = sessionJob .getStatus ();
621
+ var jobStatus = status .getJobStatus ();
622
+ jobStatus .setState (JobStatus .RUNNING ); // set a non-terminal state
623
+
624
+ FlinkResourceContext <AbstractFlinkResource <?, ?>> ctx =
625
+ getResourceContext (
626
+ sessionJob ,
627
+ TestUtils .createContextWithReadyFlinkDeployment (kubernetesClient ));
628
+ ctx .getExceptionCacheEntry ().setInitialized (true );
629
+ ctx .getExceptionCacheEntry ().setJobId (sessionJob .getStatus ().getJobStatus ().getJobId ());
630
+ ctx .getExceptionCacheEntry ().setLastTimestamp (Instant .ofEpochMilli (2500L ));
631
+
632
+ var jobId = JobID .fromHexString (sessionJob .getStatus ().getJobStatus ().getJobId ());
633
+ flinkService .submitJobToSessionCluster (
634
+ sessionJob .getMetadata (),
635
+ sessionJob .getSpec (),
636
+ jobId ,
637
+ ctx .getDeployConfig (sessionJob .getSpec ()),
638
+ null );
639
+
640
+ // Map exception names to timestamps
641
+ Map <String , Long > exceptionHistory =
642
+ Map .of (
643
+ "OldSessionException" , 1000L ,
644
+ "MidSessionException" , 2000L ,
645
+ "NewSessionException" , 3000L );
646
+ String dummyStackTrace =
647
+ "org.apache.%s\n "
648
+ + "\t at org.apache.flink.kubernetes.operator.observer.JobStatusObserverTest.testSessionJobIgnoreOldExceptions(JobStatusObserverTest.java:1)\n "
649
+ + "\t at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n "
650
+ + "\t at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n "
651
+ + "\t at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n "
652
+ + "\t at java.base/java.lang.reflect.Method.invoke(Method.java:566)\n " ;
653
+ // Add mapped exceptions
654
+ exceptionHistory .forEach (
655
+ (exceptionName , timestamp ) -> {
656
+ String fullStackTrace = String .format (dummyStackTrace , exceptionName );
657
+ flinkService .addExceptionHistory (
658
+ jobId , "org.apache." + exceptionName , fullStackTrace , timestamp );
659
+ });
660
+
661
+ // Ensure jobFailedErr is null before the observe call
662
+ flinkService .setJobFailedErr (null );
663
+ observer .observe (ctx );
664
+
665
+ var events =
666
+ kubernetesClient
667
+ .v1 ()
668
+ .events ()
669
+ .inNamespace (sessionJob .getMetadata ().getNamespace ())
670
+ .list ()
671
+ .getItems ();
672
+ assertEquals (1 , events .size ());
673
+ assertTrue (events .get (0 ).getMessage ().contains ("org.apache.NewSessionException" ));
674
+ }
675
+
676
+ @ Test
677
+ public void testSessionJobExceptionEventTriggerInitialization () throws Exception {
678
+ var sessionJob = initSessionJob ();
679
+ var status = sessionJob .getStatus ();
680
+ var jobStatus = status .getJobStatus ();
681
+ jobStatus .setState (JobStatus .RUNNING ); // set a non-terminal state
682
+
683
+ FlinkResourceContext <AbstractFlinkResource <?, ?>> ctx =
684
+ getResourceContext (
685
+ sessionJob ,
686
+ TestUtils .createContextWithReadyFlinkDeployment (kubernetesClient ));
687
+
688
+ var now = Instant .now ();
689
+ var jobId = JobID .fromHexString (sessionJob .getStatus ().getJobStatus ().getJobId ());
690
+ flinkService .submitJobToSessionCluster (
691
+ sessionJob .getMetadata (),
692
+ sessionJob .getSpec (),
693
+ jobId ,
694
+ ctx .getDeployConfig (sessionJob .getSpec ()),
695
+ null );
696
+
697
+ // Old exception that happened outside of kubernetes event retention should be ignored
698
+ flinkService .addExceptionHistory (
699
+ jobId ,
700
+ "OldSessionException" ,
701
+ "OldSessionException" ,
702
+ now .minus (Duration .ofHours (1 )).toEpochMilli ());
703
+ flinkService .addExceptionHistory (
704
+ jobId ,
705
+ "NewSessionException" ,
706
+ "NewSessionException" ,
707
+ now .minus (Duration .ofMinutes (1 )).toEpochMilli ());
708
+
709
+ // Ensure jobFailedErr is null before the observe call
710
+ flinkService .setJobFailedErr (null );
711
+ observer .observe (ctx );
712
+
713
+ var events =
714
+ kubernetesClient
715
+ .v1 ()
716
+ .events ()
717
+ .inNamespace (sessionJob .getMetadata ().getNamespace ())
718
+ .list ()
719
+ .getItems ();
720
+ assertEquals (1 , events .size ());
721
+ assertTrue (events .get (0 ).getMessage ().contains ("NewSessionException" ));
722
+ assertTrue (ctx .getExceptionCacheEntry ().isInitialized ());
723
+ assertEquals (
724
+ now .minus (Duration .ofMinutes (1 )).truncatedTo (ChronoUnit .MILLIS ),
725
+ ctx .getExceptionCacheEntry ().getLastTimestamp ());
726
+ }
727
+
421
728
private static Stream <Arguments > cancellingArgs () {
422
729
var args = new ArrayList <Arguments >();
423
730
for (var status : JobStatus .values ()) {
0 commit comments