
Commit 4ea3fbc

Make deliberately scaled-in unstarted blocks not be failures (#3594)
This PR adds a new terminal job state, SCALED_IN. None of the existing providers will return it, but the scaling layer will use it to mark a job as deliberately scaled in, so that error handling code will not regard it as failed. Fixes #3568
1 parent 789ee82 commit 4ea3fbc
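
To make the intent concrete, here is a minimal sketch of the kind of error-handling check the new state is meant to satisfy. The helper count_unexpected_failures is hypothetical (nothing in this PR); only JobState, JobStatus and TERMINAL_STATES from parsl.jobs.states are real. A block that ends in SCALED_IN is terminal, but is not counted as an unexpected failure:

from parsl.jobs.states import JobState, JobStatus, TERMINAL_STATES

def count_unexpected_failures(block_status):
    # Hypothetical policy: any terminal block that neither completed nor was
    # deliberately scaled in counts as an unexpected failure.
    return sum(1 for s in block_status.values()
               if s.state in TERMINAL_STATES
               and s.state not in (JobState.COMPLETED, JobState.SCALED_IN))

blocks = {
    "0": JobStatus(JobState.SCALED_IN),  # deliberately scaled in: not a failure
    "1": JobStatus(JobState.FAILED),     # a genuine failure
}
assert count_unexpected_failures(blocks) == 1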

6 files changed: +102 additions, -10 deletions

parsl/executors/high_throughput/executor.py

Lines changed: 2 additions & 1 deletion
@@ -790,7 +790,8 @@ def status(self) -> Dict[str, JobStatus]:
         connected_blocks = self.connected_blocks()
         for job_id in job_status:
             job_info = job_status[job_id]
-            if job_info.terminal and job_id not in connected_blocks:
+            if job_info.terminal and job_id not in connected_blocks and job_info.state != JobState.SCALED_IN:
+                logger.debug("Rewriting job %s from status %s to MISSING", job_id, job_info)
                 job_status[job_id].state = JobState.MISSING
                 if job_status[job_id].message is None:
                     job_status[job_id].message = (
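
As a rough standalone illustration of this hunk's effect (reusing names from the diff; this is not the executor's real code path, and it assumes JobStatus.terminal reflects membership in TERMINAL_STATES): a terminal block that never connected to the interchange is still rewritten to MISSING, unless it was deliberately scaled in.

from parsl.jobs.states import JobState, JobStatus

connected_blocks = []  # nothing ever registered with the interchange

job_status = {
    "0": JobStatus(JobState.SCALED_IN),  # scaled in before it could register
    "1": JobStatus(JobState.CANCELLED),  # terminal for some other reason
}

for job_id, job_info in job_status.items():
    if job_info.terminal and job_id not in connected_blocks and job_info.state != JobState.SCALED_IN:
        job_info.state = JobState.MISSING

assert job_status["0"].state == JobState.SCALED_IN  # left alone by the new check
assert job_status["1"].state == JobState.MISSING    # still treated as missing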

parsl/executors/status_handling.py

Lines changed: 5 additions & 2 deletions
@@ -347,7 +347,10 @@ def scale_in_facade(self, n: int, max_idletime: Optional[float] = None) -> List[
         if block_ids is not None:
             new_status = {}
             for block_id in block_ids:
-                new_status[block_id] = JobStatus(JobState.CANCELLED)
-                del self._status[block_id]
+                logger.debug("Marking block %s as SCALED_IN", block_id)
+                s = JobStatus(JobState.SCALED_IN)
+                new_status[block_id] = s
+                self._status[block_id] = s
+                self._simulated_status[block_id] = s
             self.send_monitoring_info(new_status)
         return block_ids
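
A toy sketch of the bookkeeping change, using a plain dictionary rather than parsl's BlockProviderExecutor: the old code deleted the block's entry, so later status handling could only conclude the block had gone MISSING; the new code records a SCALED_IN status so the block stays visible as deliberately terminated.

from parsl.jobs.states import JobState, JobStatus

status = {"0": JobStatus(JobState.RUNNING)}

def scale_in_block(block_id):
    # instead of `del status[block_id]`, remember the deliberate scale-in
    status[block_id] = JobStatus(JobState.SCALED_IN)

scale_in_block("0")
assert status["0"].terminal
assert status["0"].state == JobState.SCALED_IN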

parsl/jobs/states.py

Lines changed: 6 additions & 1 deletion
@@ -46,12 +46,17 @@ class JobState(IntEnum):
     bad worker environment or network connectivity issues.
     """
 
+    SCALED_IN = 9
+    """This job has been deliberately scaled in. Scaling code should not be concerned
+    that the job never ran (for example for error handling purposes).
+    """
+
     def __str__(self) -> str:
         return f"{self.__class__.__name__}.{self.name}"
 
 
 TERMINAL_STATES = [JobState.CANCELLED, JobState.COMPLETED, JobState.FAILED,
-                   JobState.TIMEOUT, JobState.MISSING]
+                   JobState.TIMEOUT, JobState.MISSING, JobState.SCALED_IN]
 
 
 class JobStatus:
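
A quick check of the new state's semantics, assuming (as the executor hunk above relies on) that JobStatus.terminal is derived from membership in TERMINAL_STATES:

from parsl.jobs.states import JobState, JobStatus, TERMINAL_STATES

s = JobStatus(JobState.SCALED_IN)
assert JobState.SCALED_IN in TERMINAL_STATES             # new entry added by this commit
assert s.terminal                                        # so the status counts as finished
assert str(JobState.SCALED_IN) == "JobState.SCALED_IN"   # via __str__ above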

parsl/tests/test_htex/test_multiple_disconnected_blocks.py

Lines changed: 3 additions & 5 deletions
@@ -21,16 +21,14 @@ def local_config():
                 poll_period=100,
                 max_workers_per_node=1,
                 provider=LocalProvider(
-                    worker_init="conda deactivate; export PATH=''; which python; exit 0",
-                    init_blocks=2,
-                    max_blocks=4,
-                    min_blocks=0,
+                    worker_init="exit 0",
+                    init_blocks=2
                 ),
             )
         ],
         run_dir="/tmp/test_htex",
         max_idletime=0.5,
-        strategy='htex_auto_scale',
+        strategy='none',
     )
 
 
parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py

Lines changed: 1 addition & 1 deletion
@@ -78,6 +78,6 @@ def test_row_counts(tmpd_cwd, strategy):
         (c, ) = result.first()
         assert c == 1, "There should be a single pending status"
 
-        result = connection.execute(text("SELECT COUNT(*) FROM block WHERE block_id = 0 AND status = 'CANCELLED' AND run_id = :run_id"), binds)
+        result = connection.execute(text("SELECT COUNT(*) FROM block WHERE block_id = 0 AND status = 'SCALED_IN' AND run_id = :run_id"), binds)
         (c, ) = result.first()
         assert c == 1, "There should be a single cancelled status"
New file: 85 additions & 0 deletions

@@ -0,0 +1,85 @@
+import time
+
+import pytest
+
+import parsl
+from parsl.channels import LocalChannel
+from parsl.config import Config
+from parsl.executors import HighThroughputExecutor
+from parsl.launchers import WrappedLauncher
+from parsl.providers import LocalProvider
+
+
+def local_config():
+    # see the comments inside test_regression for reasoning about why each
+    # of these parameters is set the way it is.
+    return Config(
+        max_idletime=1,
+
+        strategy='htex_auto_scale',
+        strategy_period=1,
+
+        executors=[
+            HighThroughputExecutor(
+                label="htex_local",
+                encrypted=True,
+                provider=LocalProvider(
+                    init_blocks=1,
+                    min_blocks=0,
+                    max_blocks=1,
+                    launcher=WrappedLauncher(prepend="sleep inf ; "),
+                ),
+            )
+        ],
+    )
+
+
+@parsl.python_app
+def task():
+    return 7
+
+
+@pytest.mark.local
+def test_regression(try_assert):
+    # The above config means that we should start scaling out one initial
+    # block, but then scale it back in after a second or so if the executor
+    # is kept idle (which this test does using try_assert).
+
+    # Because of 'sleep inf' in the WrappedLauncher, the block will not ever
+    # register.
+
+    # The bug being tested is about mistreatment of blocks which are scaled in
+    # before they have a chance to register, and the above forces that to
+    # happen.
+
+    # After that scaling in has happened, we should see that we have one block
+    # and it should be in a terminal state. The below try_assert waits for
+    # that to become true.
+
+    # At that time, we should also see htex reporting no blocks registered - as
+    # mentioned above, that is a necessary part of the bug being tested here.
+
+    # Give 10 strategy periods for the above to happen: each step of scale up,
+    # and scale down due to idleness isn't guaranteed to happen in exactly one
+    # scaling step.
+
+    htex = parsl.dfk().executors['htex_local']
+
+    try_assert(lambda: len(htex.status_facade) == 1 and htex.status_facade['0'].terminal,
+               timeout_ms=10000)
+
+    assert htex.connected_blocks() == [], "No block should have connected to interchange"
+
+    # Now we can reconfigure the launcher to let subsequent blocks launch ok,
+    # and run a trivial task. That trivial task will scale up a new block and
+    # run the task successfully.
+
+    # Prior to issue #3568, the bug was that the scale in of the first
+    # block earlier in the test case would have incorrectly been treated as a
+    # failure, and then the block error handler would have treated that failure
+    # as a permanent htex failure, and so the task execution below would raise
+    # a BadStateException rather than attempt to run the task.
+
+    assert htex.provider.launcher.prepend != "", "Pre-req: prepend attribute should exist and be non-empty"
+    htex.provider.launcher.prepend = ""
+    assert task().result() == 7
