Skip to content

Commit 1ff9fb0

Browse files
authored
Merge pull request #6708 from cylc/8.4.x-sync
🤖 Merge 8.4.x-sync into master
2 parents 9ed2014 + 8dbdfbc commit 1ff9fb0

File tree

20 files changed

+264
-85
lines changed

20 files changed

+264
-85
lines changed

changes.d/6169.fix

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Ensure that job submit/failure is logged, even when retries are planned.

changes.d/6589.fix.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix potential accumulation of old families in UI.

cylc/flow/data_store_mgr.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1920,10 +1920,14 @@ def _family_ascent_point_prune(
19201920
family. Then work back up to origin, checking that these families are active.
19211921
19221922
"""
1923-
fp_data = self.data[self.workflow_id][FAMILY_PROXIES]
19241923
fp_updated = self.updated[FAMILY_PROXIES]
1925-
if fp_id in fp_data:
1926-
fam_node = fp_data[fp_id]
1924+
fam_node = self.data[self.workflow_id][FAMILY_PROXIES].get(
1925+
fp_id,
1926+
self.added[FAMILY_PROXIES].get(fp_id, None)
1927+
)
1928+
# Should never be None,
1929+
# leaving in as protection against potential race conditions.
1930+
if fam_node is not None:
19271931
# Gather child families, then check/update recursively
19281932
for child_id in fam_node.child_families:
19291933
if child_id in checked_ids:
@@ -1938,11 +1942,14 @@ def _family_ascent_point_prune(
19381942
child_tasks.update(fp_updated[fp_id].child_tasks)
19391943
if fp_updated[fp_id].child_families:
19401944
child_families.update(fp_updated[fp_id].child_families)
1941-
# if any child tasks or families are in window, don't prune.
1945+
# If any child tasks or families are in window,
1946+
# then don't prune this family.
19421947
if (
19431948
child_tasks.difference(node_ids)
19441949
or child_families.difference(prune_ids)
19451950
):
1951+
# If any child tasks or families will be pruned,
1952+
# then update family states.
19461953
if (
19471954
child_tasks.intersection(node_ids)
19481955
or child_families.intersection(prune_ids)

cylc/flow/task_events_mgr.py

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -749,7 +749,7 @@ def process_message(
749749
# Already failed.
750750
return True
751751
if self._process_message_failed(
752-
itask, event_time, self.JOB_FAILED, forced
752+
itask, event_time, self.JOB_FAILED, forced, message
753753
):
754754
self.spawn_children(itask, TASK_OUTPUT_FAILED)
755755

@@ -798,7 +798,7 @@ def process_message(
798798
self.workflow_db_mgr.put_update_task_jobs(
799799
itask, {"run_signal": signal})
800800
if self._process_message_failed(
801-
itask, event_time, self.JOB_FAILED, forced
801+
itask, event_time, self.JOB_FAILED, forced, message
802802
):
803803
self.spawn_children(itask, TASK_OUTPUT_FAILED)
804804

@@ -815,7 +815,7 @@ def process_message(
815815
self.workflow_db_mgr.put_update_task_jobs(
816816
itask, {"run_signal": aborted_with})
817817
if self._process_message_failed(
818-
itask, event_time, aborted_with, forced
818+
itask, event_time, aborted_with, forced, message
819819
):
820820
self.spawn_children(itask, TASK_OUTPUT_FAILED)
821821

@@ -930,11 +930,15 @@ def _process_message_check(
930930
return False
931931

932932
severity_lvl: int = LOG_LEVELS.get(severity, INFO)
933+
# Don't log submit/failure messages here:
934+
if flag != self.FLAG_POLLED and message in {
935+
self.EVENT_SUBMIT_FAILED, f'{FAIL_MESSAGE_PREFIX}ERR'
936+
}:
937+
return True
933938
# Demote log level to DEBUG if this is a message that duplicates what
934939
# gets logged by itask state change anyway (and not manual poll)
935940
if severity_lvl > DEBUG and flag != self.FLAG_POLLED and message in {
936941
self.EVENT_SUBMITTED, self.EVENT_STARTED, self.EVENT_SUCCEEDED,
937-
self.EVENT_SUBMIT_FAILED, f'{FAIL_MESSAGE_PREFIX}ERR'
938942
}:
939943
severity_lvl = DEBUG
940944
LOG.log(severity_lvl, f"[{itask}] {flag}{message}{timestamp}")
@@ -1305,10 +1309,16 @@ def _process_message_failed(
13051309
event_time: Optional[str],
13061310
message: str,
13071311
forced: bool,
1312+
full_message: str,
13081313
) -> bool:
13091314
"""Helper for process_message, handle a failed message.
13101315
13111316
Return True if no retries (hence go to the failed state).
1317+
1318+
Args:
1319+
full_message:
1320+
If we have retries lined up we still tell users what
1321+
happened to cause this attempt to fail.
13121322
"""
13131323
no_retries = False
13141324
if event_time is None:
@@ -1321,6 +1331,7 @@ def _process_message_failed(
13211331
"run_status": 1,
13221332
"time_run_exit": event_time,
13231333
})
1334+
LOG.error(f'[{itask}] {full_message or self.EVENT_FAILED}')
13241335
if (
13251336
forced
13261337
or TimerFlags.EXECUTION_RETRY not in itask.try_timers
@@ -1345,7 +1356,7 @@ def _process_message_failed(
13451356
timer = itask.try_timers[TimerFlags.EXECUTION_RETRY]
13461357
self._retry_task(itask, timer.timeout)
13471358
delay_msg = f"retrying in {timer.delay_timeout_as_str()}"
1348-
LOG.warning(f"[{itask}] {delay_msg}")
1359+
LOG.warning(f'[{itask}] - {delay_msg}')
13491360
msg = f"{self.JOB_FAILED}, {delay_msg}"
13501361
self.setup_event_handlers(itask, self.EVENT_RETRY, msg)
13511362
self._reset_job_timers(itask)
@@ -1423,14 +1434,14 @@ def _process_message_submit_failed(
14231434
Return True if no retries (hence go to the submit-failed state).
14241435
"""
14251436
no_retries = False
1426-
LOG.critical(f"[{itask}] {self.EVENT_SUBMIT_FAILED}")
14271437
if event_time is None:
14281438
event_time = get_current_time_string()
14291439
self.workflow_db_mgr.put_update_task_jobs(itask, {
14301440
"time_submit_exit": event_time,
14311441
"submit_status": 1,
14321442
})
14331443
itask.summary['submit_method_id'] = None
1444+
LOG.error(f"[{itask}] {self.EVENT_SUBMIT_FAILED}")
14341445
if (
14351446
forced
14361447
or TimerFlags.SUBMISSION_RETRY not in itask.try_timers

tests/functional/cylc-remove/00-simple/flow.cylc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
script = false
1818
[[cleaner]]
1919
script = """
20-
cylc__job__poll_grep_workflow_log -E '1/b/01:running.* \(received\)failed'
20+
cylc__job__poll_grep_workflow_log -E '1/b/01.* failed'
2121
# Remove the unhandled failed task
2222
cylc remove "$CYLC_WORKFLOW_ID//1/b"
2323
# Remove waiting 1/c

tests/functional/cylc-remove/02-cycling/flow.cylc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,8 @@
1717
[runtime]
1818
[[remover]]
1919
script = """
20-
cylc__job__poll_grep_workflow_log -E '2020/bar/01:running.* \(received\)failed'
21-
cylc__job__poll_grep_workflow_log -E '2021/baz/01:running.* \(received\)failed'
20+
cylc__job__poll_grep_workflow_log -E '2020/bar/01.* failed'
21+
cylc__job__poll_grep_workflow_log -E '2021/baz/01.* failed'
2222
# Remove the two unhandled failed tasks.
2323
cylc remove "$CYLC_WORKFLOW_ID//*/ba*:failed"
2424
# Remove the two unsatisfied waiting tasks.

tests/functional/cylc-trigger/02-filter-failed/flow.cylc

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,9 @@
1818
[[fixer]]
1919
script = """
2020
cylc__job__wait_cylc_message_started
21-
cylc__job__poll_grep_workflow_log -E '1/fixable1/01:running.* \(received\)failed'
22-
cylc__job__poll_grep_workflow_log -E '1/fixable2/01:running.* \(received\)failed'
23-
cylc__job__poll_grep_workflow_log -E '1/fixable3/01:running.* \(received\)failed'
21+
cylc__job__poll_grep_workflow_log -E '\[1/fixable1/01:running\] failed/ERR'
22+
cylc__job__poll_grep_workflow_log -E '\[1/fixable2/01:running\] failed/ERR'
23+
cylc__job__poll_grep_workflow_log -E '\[1/fixable3/01:running\] failed/ERR'
2424
cylc trigger "${CYLC_WORKFLOW_ID}//1/fixable*"
2525
"""
2626
[[Z]]

tests/functional/cylc-trigger/04-filter-names/flow.cylc

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,11 @@
2222
[[fixer]]
2323
script = """
2424
cylc__job__wait_cylc_message_started
25-
cylc__job__poll_grep_workflow_log -E '1/fixable-1a/01.* \(received\)failed'
26-
cylc__job__poll_grep_workflow_log -E '1/fixable-1b/01.* \(received\)failed'
27-
cylc__job__poll_grep_workflow_log -E '1/fixable-2a/01.* \(received\)failed'
28-
cylc__job__poll_grep_workflow_log -E '1/fixable-2b/01.* \(received\)failed'
29-
cylc__job__poll_grep_workflow_log -E '1/fixable-3/01.* \(received\)failed'
25+
cylc__job__poll_grep_workflow_log -E '1/fixable-1a/01.* failed'
26+
cylc__job__poll_grep_workflow_log -E '1/fixable-1b/01.* failed'
27+
cylc__job__poll_grep_workflow_log -E '1/fixable-2a/01.* failed'
28+
cylc__job__poll_grep_workflow_log -E '1/fixable-2b/01.* failed'
29+
cylc__job__poll_grep_workflow_log -E '1/fixable-3/01.* failed'
3030
cylc trigger "${CYLC_WORKFLOW_ID}//" \
3131
'//1/FIXABLE-1' '//1/fixable-2*' '//1/fixable-3'
3232
"""

tests/functional/hold-release/11-retrying/flow.cylc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ t-retry-able => t-analyse
1818
[[t-hold-release]]
1919
script = """
2020
cylc__job__poll_grep_workflow_log -E \
21-
'1/t-retry-able/01:running.* \(received\)failed'
21+
'\[1/t-retry-able:waiting\] - retrying'
2222
2323
cylc__job__poll_grep_workflow_log -E \
2424
'1/t-retry-able/01:running.* => waiting'

tests/functional/reload/25-xtriggers.t

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ init_workflow "${TEST_NAME_BASE}" <<'__FLOW_CONFIG__'
4242
[[reload]]
4343
script = """
4444
# wait for "broken" to fail
45-
cylc__job__poll_grep_workflow_log -E '1/broken/01.* \(received\)failed/ERR'
45+
cylc__job__poll_grep_workflow_log -E '1/broken/01.*failed/ERR'
4646
# fix "broken" to allow it to pass
4747
sed -i 's/false/true/' "${CYLC_WORKFLOW_RUN_DIR}/flow.cylc"
4848
# reload the workflow
@@ -63,7 +63,7 @@ workflow_run_ok "${TEST_NAME_BASE}-run" cylc play "${WORKFLOW_NAME}" --no-detach
6363
log_scan "${TEST_NAME_BASE}-scan" \
6464
"$(cylc cat-log -m p "${WORKFLOW_NAME}")" \
6565
1 1 \
66-
'1/broken.* (received)failed/ERR'
66+
'1/broken/01.*failed/ERR'
6767

6868
log_scan "${TEST_NAME_BASE}-scan" \
6969
"$(cylc cat-log -m p "${WORKFLOW_NAME}")" 1 1 \

0 commit comments

Comments
 (0)