@@ -749,7 +749,7 @@ def process_message(
749
749
# Already failed.
750
750
return True
751
751
if self ._process_message_failed (
752
- itask , event_time , self .JOB_FAILED , forced
752
+ itask , event_time , self .JOB_FAILED , forced , message
753
753
):
754
754
self .spawn_children (itask , TASK_OUTPUT_FAILED )
755
755
@@ -798,7 +798,7 @@ def process_message(
798
798
self .workflow_db_mgr .put_update_task_jobs (
799
799
itask , {"run_signal" : signal })
800
800
if self ._process_message_failed (
801
- itask , event_time , self .JOB_FAILED , forced
801
+ itask , event_time , self .JOB_FAILED , forced , message
802
802
):
803
803
self .spawn_children (itask , TASK_OUTPUT_FAILED )
804
804
@@ -815,7 +815,7 @@ def process_message(
815
815
self .workflow_db_mgr .put_update_task_jobs (
816
816
itask , {"run_signal" : aborted_with })
817
817
if self ._process_message_failed (
818
- itask , event_time , aborted_with , forced
818
+ itask , event_time , aborted_with , forced , message
819
819
):
820
820
self .spawn_children (itask , TASK_OUTPUT_FAILED )
821
821
@@ -930,11 +930,15 @@ def _process_message_check(
930
930
return False
931
931
932
932
severity_lvl : int = LOG_LEVELS .get (severity , INFO )
933
+ # Don't log submit/failure messages here:
934
+ if flag != self .FLAG_POLLED and message in {
935
+ self .EVENT_SUBMIT_FAILED , f'{ FAIL_MESSAGE_PREFIX } ERR'
936
+ }:
937
+ return True
933
938
# Demote log level to DEBUG if this is a message that duplicates what
934
939
# gets logged by itask state change anyway (and not manual poll)
935
940
if severity_lvl > DEBUG and flag != self .FLAG_POLLED and message in {
936
941
self .EVENT_SUBMITTED , self .EVENT_STARTED , self .EVENT_SUCCEEDED ,
937
- self .EVENT_SUBMIT_FAILED , f'{ FAIL_MESSAGE_PREFIX } ERR'
938
942
}:
939
943
severity_lvl = DEBUG
940
944
LOG .log (severity_lvl , f"[{ itask } ] { flag } { message } { timestamp } " )
@@ -1305,10 +1309,16 @@ def _process_message_failed(
1305
1309
event_time : Optional [str ],
1306
1310
message : str ,
1307
1311
forced : bool ,
1312
+ full_message : str ,
1308
1313
) -> bool :
1309
1314
"""Helper for process_message, handle a failed message.
1310
1315
1311
1316
Return True if no retries (hence go to the failed state).
1317
+
1318
+ Args:
1319
+ full_message:
1320
+ If we have retries lined up we still tell users what
1321
+ happened to cause the this attempt to fail.
1312
1322
"""
1313
1323
no_retries = False
1314
1324
if event_time is None :
@@ -1321,6 +1331,7 @@ def _process_message_failed(
1321
1331
"run_status" : 1 ,
1322
1332
"time_run_exit" : event_time ,
1323
1333
})
1334
+ LOG .error (f'[{ itask } ] { full_message or self .EVENT_FAILED } ' )
1324
1335
if (
1325
1336
forced
1326
1337
or TimerFlags .EXECUTION_RETRY not in itask .try_timers
@@ -1345,7 +1356,7 @@ def _process_message_failed(
1345
1356
timer = itask .try_timers [TimerFlags .EXECUTION_RETRY ]
1346
1357
self ._retry_task (itask , timer .timeout )
1347
1358
delay_msg = f"retrying in { timer .delay_timeout_as_str ()} "
1348
- LOG .warning (f" [{ itask } ] { delay_msg } " )
1359
+ LOG .warning (f' [{ itask } ] - { delay_msg } ' )
1349
1360
msg = f"{ self .JOB_FAILED } , { delay_msg } "
1350
1361
self .setup_event_handlers (itask , self .EVENT_RETRY , msg )
1351
1362
self ._reset_job_timers (itask )
@@ -1423,14 +1434,14 @@ def _process_message_submit_failed(
1423
1434
Return True if no retries (hence go to the submit-failed state).
1424
1435
"""
1425
1436
no_retries = False
1426
- LOG .critical (f"[{ itask } ] { self .EVENT_SUBMIT_FAILED } " )
1427
1437
if event_time is None :
1428
1438
event_time = get_current_time_string ()
1429
1439
self .workflow_db_mgr .put_update_task_jobs (itask , {
1430
1440
"time_submit_exit" : event_time ,
1431
1441
"submit_status" : 1 ,
1432
1442
})
1433
1443
itask .summary ['submit_method_id' ] = None
1444
+ LOG .error (f"[{ itask } ] { self .EVENT_SUBMIT_FAILED } " )
1434
1445
if (
1435
1446
forced
1436
1447
or TimerFlags .SUBMISSION_RETRY not in itask .try_timers
0 commit comments