Skip to content

Commit 9c3cd49

Browse files
authored
platforms: fix unreachable hosts not reset on platform group failure (#6109)
1 parent 2ae50b2 commit 9c3cd49

File tree

6 files changed

+85
-5
lines changed

6 files changed

+85
-5
lines changed

changes.d/fix.6109.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fixed a bug affecting job submission where the list of bad hosts was not always reset after every platform in a platform group had failed.

cylc/flow/exceptions.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
Callable,
2222
Dict,
2323
Iterable,
24+
Set,
2425
NoReturn,
2526
Optional,
2627
Tuple,
@@ -444,15 +445,21 @@ class NoPlatformsError(PlatformLookupError):
444445
445446
Args:
446447
identity: The name of the platform group or install target
448+
hosts_consumed: Hosts which have already been tried.
447449
set_type: Whether the set of platforms is a platform group or an
448450
install target
449451
place: Where the attempt to get the platform failed.
450452
"""
451453
def __init__(
452-
self, identity: str, set_type: str = 'group', place: str = ''
454+
self,
455+
identity: str,
456+
hosts_consumed: Set[str],
457+
set_type: str = 'group',
458+
place: str = '',
453459
):
454460
self.identity = identity
455461
self.type = set_type
462+
self.hosts_consumed = hosts_consumed
456463
if place:
457464
self.place = f' during {place}.'
458465
else:

cylc/flow/platforms.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -302,9 +302,14 @@ def get_platform_from_group(
302302
else:
303303
platform_names = group['platforms']
304304

305-
# Return False if there are no platforms available to be selected.
305+
# If there are no platforms available to be selected:
306306
if not platform_names:
307-
raise NoPlatformsError(group_name)
307+
hosts_consumed = {
308+
host
309+
for platform in group['platforms']
310+
for host in platform_from_name(platform)['hosts']}
311+
raise NoPlatformsError(
312+
group_name, hosts_consumed)
308313

309314
# Get the selection method
310315
method = group['selection']['method']

cylc/flow/task_job_mgr.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -267,18 +267,21 @@ def submit_task_jobs(self, workflow, itasks, curve_auth,
267267
# Prepare tasks for job submission
268268
prepared_tasks, bad_tasks = self.prep_submit_task_jobs(
269269
workflow, itasks)
270+
270271
# Reset consumed host selection results
271272
self.task_remote_mgr.subshell_eval_reset()
272273

273274
if not prepared_tasks:
274275
return bad_tasks
276+
275277
auth_itasks = {} # {platform: [itask, ...], ...}
276278

277279
for itask in prepared_tasks:
278280
platform_name = itask.platform['name']
279281
auth_itasks.setdefault(platform_name, [])
280282
auth_itasks[platform_name].append(itask)
281283
# Submit task jobs for each platform
284+
# Non-prepared tasks can be considered done for now:
282285
done_tasks = bad_tasks
283286

284287
for _, itasks in sorted(auth_itasks.items()):
@@ -1087,7 +1090,7 @@ def _prep_submit_task_job(
10871090
Returns:
10881091
* itask - preparation complete.
10891092
* None - preparation in progress.
1090-
* False - perparation failed.
1093+
* False - preparation failed.
10911094
10921095
"""
10931096
if itask.local_job_file_path:
@@ -1181,6 +1184,14 @@ def _prep_submit_task_job(
11811184
itask.summary['platforms_used'][itask.submit_num] = ''
11821185
# Retry delays, needed for the try_num
11831186
self._create_job_log_path(workflow, itask)
1187+
if isinstance(exc, NoPlatformsError):
1188+
# Clear all hosts from all platforms in group from
1189+
# bad_hosts:
1190+
self.bad_hosts -= exc.hosts_consumed
1191+
self._set_retry_timers(itask, rtconfig)
1192+
self._prep_submit_task_job_error(
1193+
workflow, itask, '(no platforms available)', exc)
1194+
return False
11841195
self._prep_submit_task_job_error(
11851196
workflow, itask, '(platform not defined)', exc)
11861197
return False

cylc/flow/task_remote_mgr.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -388,7 +388,10 @@ def remote_tidy(self) -> None:
388388
else:
389389
LOG.error(
390390
NoPlatformsError(
391-
install_target, 'install target', 'remote tidy'))
391+
install_target,
392+
set(),
393+
'install target',
394+
'remote tidy'))
392395
# Wait for commands to complete for a max of 10 seconds
393396
timeout = time() + 10.0
394397
while queue and time() < timeout:
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
# THIS FILE IS PART OF THE CYLC WORKFLOW ENGINE.
2+
# Copyright (C) NIWA & British Crown (Met Office) & Contributors.
3+
#
4+
# This program is free software: you can redistribute it and/or modify
5+
# it under the terms of the GNU General Public License as published by
6+
# the Free Software Foundation, either version 3 of the License, or
7+
# (at your option) any later version.
8+
#
9+
# This program is distributed in the hope that it will be useful,
10+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12+
# GNU General Public License for more details.
13+
#
14+
# You should have received a copy of the GNU General Public License
15+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
16+
"""Integration testing for platforms functionality."""
17+
18+
19+
async def test_prep_submit_task_tries_multiple_platforms(
20+
flow, scheduler, start, mock_glbl_cfg
21+
):
22+
"""Preparation tries multiple platforms within a group if the
23+
task platform setting matches a group and, once all platforms
24+
have been tried, the hosts belonging to that platform group are
25+
cleared from the list of bad hosts.
26+
27+
See https://github.com/cylc/cylc-flow/pull/6109
28+
"""
29+
global_conf = '''
30+
[platforms]
31+
[[myplatform]]
32+
hosts = broken
33+
[[anotherbad]]
34+
hosts = broken2
35+
[platform groups]
36+
[[mygroup]]
37+
platforms = myplatform, anotherbad'''
38+
mock_glbl_cfg('cylc.flow.platforms.glbl_cfg', global_conf)
39+
40+
wid = flow({
41+
"scheduling": {"graph": {"R1": "foo"}},
42+
"runtime": {"foo": {"platform": "mygroup"}}
43+
})
44+
schd = scheduler(wid, run_mode='live')
45+
async with start(schd):
46+
itask = schd.pool.get_tasks()[0]
47+
itask.submit_num = 1
48+
# simulate failed attempts to contact the job hosts
49+
schd.task_job_mgr.bad_hosts = {'broken', 'broken2'}
50+
res = schd.task_job_mgr._prep_submit_task_job(schd.workflow, itask)
51+
assert res is False
52+
# ensure the bad hosts have been cleared
53+
assert not schd.task_job_mgr.bad_hosts

0 commit comments

Comments
 (0)