Skip to content

Commit 1adb499

Browse files
authored
Add node reservations for LeptonExecutor (#336)
Allow users to specify an existing node reservation with the LeptonExecutor so that jobs can run on dedicated resources.

Signed-off-by: Robert Clark <[email protected]>
1 parent 178ab3c commit 1adb499

File tree

3 files changed

+136
-2
lines changed

3 files changed

+136
-2
lines changed

docs/source/guides/execution.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -295,6 +295,12 @@ def your_lepton_executor(nodes: int, gpus_per_node: int, container_image: str):
295295
mounts=[{"path": storage_path, "mount_path": mount_path}],
296296
# Optional: Add custom environment variables or PyTorch specs if needed
297297
env_vars=common_envs(),
298+
# Optional: Specify a node reservation to schedule jobs with
299+
# node_reservation="my-node-reservation",
300+
# Optional: Specify commands to run at container launch prior to the job starting
301+
# pre_launch_commands=["nvidia-smi"],
302+
# Optional: Specify image pull secrets for authenticating with container registries
303+
# image_pull_secrets=["my-image-pull-secret"],
298304
# packager=run.GitArchivePackager() # Choose appropriate packager
299305
)
300306
return executor

nemo_run/core/execution/lepton.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,12 @@
2020
LeptonContainer,
2121
Mount,
2222
)
23-
from leptonai.api.v1.types.job import LeptonJob, LeptonJobState, LeptonJobUserSpec
23+
from leptonai.api.v1.types.job import (
24+
LeptonJob,
25+
LeptonJobState,
26+
LeptonJobUserSpec,
27+
ReservationConfig,
28+
)
2429
from leptonai.api.v1.types.replica import Replica
2530

2631
from nemo_run.config import get_nemorun_home
@@ -51,6 +56,7 @@ class LeptonExecutor(Executor):
5156
shared_memory_size: int = 65536
5257
resource_shape: str = ""
5358
node_group: str = ""
59+
node_reservation: str = ""
5460
mounts: list[dict[str, Any]] = field(default_factory=list)
5561
lepton_job_dir: str = field(init=False, default="")
5662
image_pull_secrets: list[str] = field(
@@ -260,8 +266,12 @@ def create_lepton_job(self, name: str):
260266
log=None,
261267
queue_config=None,
262268
stopped=None,
263-
reservation_config=None,
264269
)
270+
271+
if self.node_reservation:
272+
job_spec.reservation_config = ReservationConfig(reservation_id=self.node_reservation)
273+
job_spec.reservation_config.reservation_id = self.node_reservation
274+
265275
job = LeptonJob(spec=job_spec, metadata=Metadata(id=name))
266276

267277
created_job = client.job.create(job)

test/core/execution/test_lepton.py

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,42 @@ def test_init(self):
5959
assert executor.nemo_run_dir == "/workspace/nemo_run"
6060
assert executor.mounts == [{"path": "/workspace", "mount_path": "/workspace"}]
6161

62+
def test_init_with_node_reservation(self):
63+
"""Test initialization with node_reservation parameter."""
64+
executor = LeptonExecutor(
65+
resource_shape="gpu.8xh100-80gb",
66+
node_group="my-node-group",
67+
container_image="test-image",
68+
nodes=2,
69+
gpus_per_node=8,
70+
nemo_run_dir="/workspace/nemo_run",
71+
mounts=[{"path": "/workspace", "mount_path": "/workspace"}],
72+
node_reservation="my-reservation-id",
73+
)
74+
75+
assert executor.node_reservation == "my-reservation-id"
76+
77+
def test_init_with_empty_node_reservation(self):
78+
"""Test initialization with empty node_reservation string."""
79+
executor = LeptonExecutor(
80+
container_image="test-image",
81+
nemo_run_dir="/test/path",
82+
mounts=[{"path": "/test", "mount_path": "/test"}],
83+
node_reservation="",
84+
)
85+
86+
assert executor.node_reservation == ""
87+
88+
def test_init_without_node_reservation(self):
89+
"""Test initialization without node_reservation parameter (default behavior)."""
90+
executor = LeptonExecutor(
91+
container_image="test-image",
92+
nemo_run_dir="/test/path",
93+
mounts=[{"path": "/test", "mount_path": "/test"}],
94+
)
95+
96+
assert executor.node_reservation == ""
97+
6298
@patch("nemo_run.core.execution.lepton.APIClient")
6399
def test_stop_job(self, mock_APIClient):
64100
mock_instance = MagicMock()
@@ -344,6 +380,88 @@ def test_create_lepton_job(self, mock_APIClient_class):
344380

345381
mock_client.job.create.assert_called_once()
346382

383+
@patch("nemo_run.core.execution.lepton.APIClient")
384+
def test_create_lepton_job_with_reservation_config(self, mock_APIClient_class):
385+
"""Test create_lepton_job creates ReservationConfig when node_reservation is set."""
386+
mock_client = mock_APIClient_class.return_value
387+
mock_client.job.create.return_value = LeptonJob(metadata=Metadata(id="my-lepton-job"))
388+
node_group = SimpleNamespace(metadata=SimpleNamespace(id_="123456"))
389+
390+
mock_client.nodegroup.list_all.return_value = []
391+
valid_node_ids = ["node-id-1", "node-id-2"]
392+
393+
executor = LeptonExecutor(
394+
container_image="test-image",
395+
nemo_run_dir="/test/path",
396+
node_group="123456",
397+
mounts=[{"path": "/test", "mount_path": "/test"}],
398+
node_reservation="my-reservation-id",
399+
)
400+
executor._valid_node_ids = MagicMock(return_value=valid_node_ids)
401+
executor._node_group_id = MagicMock(return_value=node_group)
402+
403+
executor.create_lepton_job("my-lepton-job")
404+
405+
# Verify that job.create was called with the correct ReservationConfig
406+
mock_client.job.create.assert_called_once()
407+
created_job = mock_client.job.create.call_args[0][0]
408+
assert created_job.spec.reservation_config is not None
409+
assert created_job.spec.reservation_config.reservation_id == "my-reservation-id"
410+
411+
@patch("nemo_run.core.execution.lepton.APIClient")
412+
def test_create_lepton_job_without_reservation_config(self, mock_APIClient_class):
413+
"""Test create_lepton_job creates no ReservationConfig when node_reservation is not set."""
414+
mock_client = mock_APIClient_class.return_value
415+
mock_client.job.create.return_value = LeptonJob(metadata=Metadata(id="my-lepton-job"))
416+
node_group = SimpleNamespace(metadata=SimpleNamespace(id_="123456"))
417+
418+
mock_client.nodegroup.list_all.return_value = []
419+
valid_node_ids = ["node-id-1", "node-id-2"]
420+
421+
executor = LeptonExecutor(
422+
container_image="test-image",
423+
nemo_run_dir="/test/path",
424+
node_group="123456",
425+
mounts=[{"path": "/test", "mount_path": "/test"}],
426+
# No node_reservation set
427+
)
428+
executor._valid_node_ids = MagicMock(return_value=valid_node_ids)
429+
executor._node_group_id = MagicMock(return_value=node_group)
430+
431+
executor.create_lepton_job("my-lepton-job")
432+
433+
# Verify that job.create was called with no ReservationConfig
434+
mock_client.job.create.assert_called_once()
435+
created_job = mock_client.job.create.call_args[0][0]
436+
assert created_job.spec.reservation_config is None
437+
438+
@patch("nemo_run.core.execution.lepton.APIClient")
439+
def test_create_lepton_job_with_empty_reservation_config(self, mock_APIClient_class):
440+
"""Test create_lepton_job creates no ReservationConfig when node_reservation is empty string."""
441+
mock_client = mock_APIClient_class.return_value
442+
mock_client.job.create.return_value = LeptonJob(metadata=Metadata(id="my-lepton-job"))
443+
node_group = SimpleNamespace(metadata=SimpleNamespace(id_="123456"))
444+
445+
mock_client.nodegroup.list_all.return_value = []
446+
valid_node_ids = ["node-id-1", "node-id-2"]
447+
448+
executor = LeptonExecutor(
449+
container_image="test-image",
450+
nemo_run_dir="/test/path",
451+
node_group="123456",
452+
mounts=[{"path": "/test", "mount_path": "/test"}],
453+
node_reservation="", # Empty string
454+
)
455+
executor._valid_node_ids = MagicMock(return_value=valid_node_ids)
456+
executor._node_group_id = MagicMock(return_value=node_group)
457+
458+
executor.create_lepton_job("my-lepton-job")
459+
460+
# Verify that job.create was called with no ReservationConfig
461+
mock_client.job.create.assert_called_once()
462+
created_job = mock_client.job.create.call_args[0][0]
463+
assert created_job.spec.reservation_config is None
464+
347465
def test_nnodes(self):
348466
executor = LeptonExecutor(
349467
container_image="nvcr.io/nvidia/test:latest",

0 commit comments

Comments
 (0)