Skip to content

Commit a8498fb

Browse files
kiukchung authored and facebook-github-bot committed
(monarch/tools) Add force_restart argument to get_or_create command (#783)
Summary: Pull Request resolved: #783. Passing `force_restart=True` kills the existing job and forces it to be created again. This is useful when we want to redeploy the server to reflect changes in the local conda environment or workspace that are not eligible for CodeSync. NOTE: this is tangential to setting `workspace=None`, which disables building the local conda environment AND workspace into an ephemeral image. Reviewed By: ahmadsharif1. Differential Revision: D79754696. fbshipit-source-id: 085aa2f1633d5b4ebac375a8494fc754a532d4a1
1 parent e704f43 commit a8498fb

File tree

2 files changed

+51
-0
lines changed

2 files changed

+51
-0
lines changed

python/monarch/tools/commands.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,7 @@ async def get_or_create(
265265
name: str,
266266
config: Config,
267267
check_interval: timedelta = _5_SECONDS,
268+
force_restart: bool = False,
268269
) -> ServerSpec:
269270
"""Waits for the server based on identity `name` in the scheduler specified in the `config`
270271
to be ready (e.g. RUNNING). If the server is not found then this function creates one
@@ -286,6 +287,7 @@ async def get_or_create(
286287
name: the name of the server (job) to get or create
287288
config: configs used to create the job if one does not exist
288289
check_interval: how often to poll the status of the job when waiting for it to be ready
290+
force_restart: if True kills and re-creates the job even if one exists
289291
290292
Returns: A `ServerSpec` containing information about either the existing or the newly
291293
created server.
@@ -322,6 +324,12 @@ async def get_or_create(
322324
return server_info
323325
else:
324326
print(f"{CYAN}Found existing job `{server_handle}` ready to serve.{ENDC}")
327+
328+
if force_restart:
329+
print(f"{CYAN}force_restart=True, restarting `{server_handle}`.{ENDC}")
330+
kill(server_handle)
331+
server_info = await get_or_create(name, config, check_interval)
332+
325333
return server_info
326334

327335

python/tests/tools/test_commands.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424

2525
CMD_INFO = "monarch.tools.commands.info"
2626
CMD_CREATE = "monarch.tools.commands.create"
27+
CMD_KILL = "monarch.tools.commands.kill"
2728

2829

2930
class TestCommands(unittest.TestCase):
@@ -336,3 +337,45 @@ async def test_get_or_create_new_server_missing(
336337
config=config,
337338
check_interval=_5_MS,
338339
)
340+
341+
async def test_get_or_create_force_restart(self) -> None:
    """Check that `force_restart=True` kills a running server and creates a new one.

    `info` is mocked to report `slurm:///123` as RUNNING, then CANCELLED
    (after the kill), followed by the SUBMITTED -> PENDING -> RUNNING
    progression of the replacement server `slurm:///456` that the mocked
    `create` returns.
    """
    with mock.patch(
        CMD_INFO,
        side_effect=[
            # -- state for slurm:///123
            server(AppState.RUNNING, name="123"),
            # -- force_restart kills the server
            server(AppState.CANCELLED, name="123"),
            # -- states for (new) slurm:///456
            server(AppState.SUBMITTED, name="456"),
            server(AppState.PENDING, name="456"),
            server(AppState.RUNNING, name="456"),
        ],
    ) as mock_info, mock.patch(
        CMD_CREATE, return_value="slurm:///456"
    ) as mock_create, mock.patch(CMD_KILL) as mock_kill:
        config = Config(
            scheduler="slurm",
            scheduler_args={},
            appdef=defaults.component_fn("slurm")(),
        )
        server_info = await commands.get_or_create(
            name="123",
            config=config,
            check_interval=_5_MS,
            force_restart=True,
        )

        # `called_once_with` is not a Mock assertion (it silently creates a
        # child mock and checks nothing); use the real `assert_*` method so
        # the create call is actually verified.
        # NOTE(review): assumes get_or_create() invokes create(config, name)
        # positionally -- confirm against commands.py.
        mock_create.assert_called_once_with(config, "123")
        mock_kill.assert_called_once_with("slurm:///123")
        self.assertEqual(server_info.server_handle, "slurm:///456")
        # Two polls of the old handle (found running, then cancelled after the
        # kill), followed by three polls of the new handle until it is RUNNING.
        self.assertListEqual(
            mock_info.call_args_list,
            [
                mock.call("slurm:///123"),
                mock.call("slurm:///123"),
                mock.call("slurm:///456"),
                mock.call("slurm:///456"),
                mock.call("slurm:///456"),
            ],
        )

0 commit comments

Comments
 (0)