diff --git a/.gitignore b/.gitignore index 092d3741e..ee5d8e77b 100644 --- a/.gitignore +++ b/.gitignore @@ -89,3 +89,6 @@ Thumbs.db install/ results/ .* + +# conda +env/ \ No newline at end of file diff --git a/README.md b/README.md index 324ac70b8..8c0b30a0c 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,7 @@ These schemas enable CloudAI to be flexible and compatible with different system |Sleep|✅|✅|✅| |UCC|✅|❌|❌| |SlurmContainer|✅|❌|❌| +|SlurmRayContainer|✅|❌|❌| |MegatronRun (experimental)|✅|❌|❌| diff --git a/conf/common/test/slurm_ray_container_vllm.toml b/conf/common/test/slurm_ray_container_vllm.toml new file mode 100644 index 000000000..ff3835393 --- /dev/null +++ b/conf/common/test/slurm_ray_container_vllm.toml @@ -0,0 +1,23 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name = "slurm_ray_container_vllm" +description = "Run example script with vLLM" +test_template_name = "SlurmRayContainer" + +[cmd_args] +docker_image_url = "vllm/vllm-openai:latest" +cmd = "python3 examples/offline_inference/llm_engine_example.py -tp 8 -pp 2" diff --git a/conf/common/test_scenario/sleep.toml b/conf/common/test_scenario/sleep.toml index daca397c4..2d3b06597 100644 --- a/conf/common/test_scenario/sleep.toml +++ b/conf/common/test_scenario/sleep.toml @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/conf/common/test_scenario/slurm_ray_container.toml b/conf/common/test_scenario/slurm_ray_container.toml new file mode 100644 index 000000000..73e4506ff --- /dev/null +++ b/conf/common/test_scenario/slurm_ray_container.toml @@ -0,0 +1,22 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name = "slurm_ray_container_example" + +[[Tests]] +id = "Tests.1" +test_name = "slurm_ray_container_vllm" +num_nodes = "2" diff --git a/pyproject.toml b/pyproject.toml index 12b5d2f18..4bb5dfcd8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,6 +24,7 @@ dependencies = [ "toml==0.10.2", "kubernetes==30.1.0", "pydantic==2.8.2", + "jinja2==3.1.6", ] [project.scripts] cloudai = "cloudai.__main__:main" diff --git a/requirements.txt b/requirements.txt index ddaf06e25..c585ee021 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ tbparse==0.0.8 toml==0.10.2 kubernetes==30.1.0 pydantic==2.8.2 +jinja2==3.1.6 diff --git a/src/cloudai/__init__.py b/src/cloudai/__init__.py index 4c6d1210c..f33d077fc 100644 --- a/src/cloudai/__init__.py +++ b/src/cloudai/__init__.py @@ -97,6 +97,7 @@ SleepTestDefinition, ) from .workloads.slurm_container import SlurmContainerCommandGenStrategy, SlurmContainerTestDefinition +from .workloads.slurm_ray_container import SlurmRayContainerCommandGenStrategy, SlurmRayContainerTestDefinition from .workloads.ucc_test import ( UCCTestDefinition, UCCTestGradingStrategy, @@ -156,6 +157,7 @@ SleepTestDefinition, NeMoRunTestDefinition, SlurmContainerTestDefinition, + SlurmRayContainerTestDefinition, MegatronRunTestDefinition, ], SlurmJobIdRetrievalStrategy, @@ -191,6 +193,7 @@ SleepTestDefinition, NeMoRunTestDefinition, SlurmContainerTestDefinition, + SlurmRayContainerTestDefinition, MegatronRunTestDefinition, ], DefaultJobStatusRetrievalStrategy, @@ -207,6 +210,9 @@ Registry().add_strategy( CommandGenStrategy, [SlurmSystem], [SlurmContainerTestDefinition], SlurmContainerCommandGenStrategy ) +Registry().add_strategy( + CommandGenStrategy, [SlurmSystem], [SlurmRayContainerTestDefinition], SlurmRayContainerCommandGenStrategy +) Registry().add_installer("slurm", SlurmInstaller) Registry().add_installer("standalone", StandaloneInstaller) @@ -226,6 +232,7 @@ Registry().add_test_definition("JaxToolboxGrok", GrokTestDefinition) 
Registry().add_test_definition("JaxToolboxNemotron", NemotronTestDefinition)
 Registry().add_test_definition("SlurmContainer", SlurmContainerTestDefinition)
+Registry().add_test_definition("SlurmRayContainer", SlurmRayContainerTestDefinition)
 Registry().add_test_definition("MegatronRun", MegatronRunTestDefinition)
 
 Registry().add_agent("grid_search", GridSearchAgent)
diff --git a/src/cloudai/_core/test_scenario_parser.py b/src/cloudai/_core/test_scenario_parser.py
index 76c1d1dc1..ce41debb9 100644
--- a/src/cloudai/_core/test_scenario_parser.py
+++ b/src/cloudai/_core/test_scenario_parser.py
@@ -36,6 +36,10 @@
 from cloudai.workloads.nemo_run import NeMoRunReportGenerationStrategy, NeMoRunTestDefinition
 from cloudai.workloads.sleep import SleepReportGenerationStrategy, SleepTestDefinition
 from cloudai.workloads.slurm_container import SlurmContainerReportGenerationStrategy, SlurmContainerTestDefinition
+from cloudai.workloads.slurm_ray_container import (
+    SlurmRayContainerReportGenerationStrategy,
+    SlurmRayContainerTestDefinition,
+)
 from cloudai.workloads.ucc_test import UCCTestDefinition, UCCTestReportGenerationStrategy
 
 from .exceptions import TestScenarioParsingError, format_validation_error
@@ -54,6 +58,7 @@
     NemotronTestDefinition: {JaxToolboxReportGenerationStrategy},
     SleepTestDefinition: {SleepReportGenerationStrategy},
     SlurmContainerTestDefinition: {SlurmContainerReportGenerationStrategy},
+    SlurmRayContainerTestDefinition: {SlurmRayContainerReportGenerationStrategy},
     UCCTestDefinition: {UCCTestReportGenerationStrategy},
 }
 
diff --git a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py
index 2f57ab802..416401178 100644
--- a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py
+++ b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py
@@ -310,37 +310,61 @@ def _write_sbatch_script(
 
         return f"sbatch {batch_script_path}"
 
-    def _append_sbatch_directives(
-        self, batch_script_content: List[str], args: Dict[str, Any], output_path: Path
-    ) -> None:
+    def _get_sbatch_directives(self, args: Dict[str, Any], output_path: Path) -> Dict[str, str]:
         """
-        Append SBATCH directives to the batch script content.
+        Get the Slurm batch script directives.
 
         Args:
-            batch_script_content (List[str]): The list of script lines to append to.
-            args (Dict[str, Any]): Arguments including job settings.
+            args (Dict[str, Any]): Slurm-specific arguments.
             output_path (Path): Output directory for script and logs.
+
+        Returns:
+            Dict[str, str]: sbatch long-option names (without the leading "--") mapped to their values.
         """
-        batch_script_content = self._add_reservation(batch_script_content)
+        sbatch_directives: Dict[str, str] = {}
 
         if "output" not in args:
-            batch_script_content.append(f"#SBATCH --output={output_path / 'stdout.txt'}")
+            sbatch_directives["output"] = f"{output_path / 'stdout.txt'}"
         if "error" not in args:
-            batch_script_content.append(f"#SBATCH --error={output_path / 'stderr.txt'}")
-        batch_script_content.append(f"#SBATCH --partition={self.system.default_partition}")
+            sbatch_directives["error"] = f"{output_path / 'stderr.txt'}"
+
+        sbatch_directives["partition"] = self.system.default_partition
+
         if args["node_list_str"]:
-            batch_script_content.append(f"#SBATCH --nodelist={args['node_list_str']}")
+            sbatch_directives["nodelist"] = args["node_list_str"]
         if self.system.account:
-            batch_script_content.append(f"#SBATCH --account={self.system.account}")
+            sbatch_directives["account"] = self.system.account
         if self.system.distribution:
-            batch_script_content.append(f"#SBATCH --distribution={self.system.distribution}")
+            sbatch_directives["distribution"] = self.system.distribution
         if self.system.gpus_per_node:
-            batch_script_content.append(f"#SBATCH --gpus-per-node={self.system.gpus_per_node}")
-            batch_script_content.append(f"#SBATCH --gres=gpu:{self.system.gpus_per_node}")
+            sbatch_directives["gpus-per-node"] = str(self.system.gpus_per_node)  # key is written verbatim as --<key>
+            sbatch_directives["gres"] = f"gpu:{self.system.gpus_per_node}"
         if self.system.ntasks_per_node:
-            batch_script_content.append(f"#SBATCH --ntasks-per-node={self.system.ntasks_per_node}")
+            sbatch_directives["ntasks-per-node"] = str(self.system.ntasks_per_node)
         if "time_limit" in args:
-            batch_script_content.append(f"#SBATCH --time={args['time_limit']}")
+            sbatch_directives["time"] = args["time_limit"]  # the sbatch option is --time, not --time_limit
+
+        return sbatch_directives
+
+    def _append_sbatch_directives(
+        self, batch_script_content: List[str], args: Dict[str, Any], output_path: Path
+    ) -> None:
+        """
+        Append SBATCH directives to the batch script content.
+
+        Args:
+            batch_script_content (List[str]): The list of script lines to append to.
+            args (Dict[str, Any]): Arguments including job settings.
+            output_path (Path): Output directory for script and logs.
+        """
+        batch_script_content = self._add_reservation(batch_script_content)
+        sbatch_directives = self._get_sbatch_directives(args, output_path)
+
+        for key, value in sbatch_directives.items():
+            if value:
+                batch_script_content.append(f"#SBATCH --{key}={value}")
+            else:  # empty value: write a bare flag without "=value"
+                batch_script_content.append(f"#SBATCH --{key}")
 
         for arg in self.system.extra_sbatch_args:
             batch_script_content.append(f"#SBATCH {arg}")
diff --git a/src/cloudai/workloads/slurm_ray_container/__init__.py b/src/cloudai/workloads/slurm_ray_container/__init__.py
new file mode 100644
index 000000000..875c47bb0
--- /dev/null
+++ b/src/cloudai/workloads/slurm_ray_container/__init__.py
@@ -0,0 +1,26 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .report_generation_strategy import SlurmRayContainerReportGenerationStrategy +from .slurm_command_gen_strategy import SlurmRayContainerCommandGenStrategy +from .slurm_ray_container import SlurmRayContainerCmdArgs, SlurmRayContainerTestDefinition + +__all__ = [ + "SlurmRayContainerCmdArgs", + "SlurmRayContainerCommandGenStrategy", + "SlurmRayContainerReportGenerationStrategy", + "SlurmRayContainerTestDefinition", +] diff --git a/src/cloudai/workloads/slurm_ray_container/report_generation_strategy.py b/src/cloudai/workloads/slurm_ray_container/report_generation_strategy.py new file mode 100644 index 000000000..d8978e8da --- /dev/null +++ b/src/cloudai/workloads/slurm_ray_container/report_generation_strategy.py @@ -0,0 +1,28 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+
+from cloudai import ReportGenerationStrategy
+
+
+class SlurmRayContainerReportGenerationStrategy(ReportGenerationStrategy):
+    """Report generation strategy for a generic Slurm ray container test."""
+
+    def can_handle_directory(self) -> bool:
+        return False  # never claims a results directory
+
+    def generate_report(self) -> None:
+        pass  # no-op: nothing is written
diff --git a/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py b/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py
new file mode 100644
index 000000000..e6419d0f4
--- /dev/null
+++ b/src/cloudai/workloads/slurm_ray_container/slurm_command_gen_strategy.py
@@ -0,0 +1,97 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pathlib import Path
+from typing import Any, Dict, List, Union, cast
+
+from jinja2 import Template
+
+from cloudai import TestRun
+from cloudai.workloads.slurm_container import SlurmContainerCommandGenStrategy
+
+from .slurm_ray_container import SlurmRayContainerTestDefinition
+
+
+class SlurmRayContainerCommandGenStrategy(SlurmContainerCommandGenStrategy):
+    """Command generation strategy for Slurm container tests that run their command on a Ray cluster."""
+
+    def _get_sbatch_directives(self, args: Dict[str, Any], output_path: Path) -> Dict[str, str]:
+        """Return the inherited sbatch directives with two tasks per node and the exclusive flag set."""
+        sbatch_directives = super()._get_sbatch_directives(args, output_path)
+        # TODO(Amey): We probably need to figure out what to do with cpus-per-task, mem-per-cpu
+        # override tasks per node: drop the inherited entry first so that only one such directive is written
+        sbatch_directives.pop("ntasks_per_node", None)
+        sbatch_directives["ntasks-per-node"] = "2"
+        # an empty value makes the base class write a bare "#SBATCH --exclusive" line
+        sbatch_directives["exclusive"] = ""
+
+        return sbatch_directives
+
+    def _gen_srun_command(
+        self,
+        slurm_args: Dict[str, Any],
+        env_vars: Dict[str, str],
+        cmd_args: Dict[str, Union[str, List[str]]],
+        tr: TestRun,
+    ) -> str:
+        """Join the srun prefix and nsys command parts, then render the launch script with that prefix."""
+        srun_command_parts = self.gen_srun_prefix(slurm_args, tr)
+        nsys_command_parts = super().gen_nsys_command(tr)
+        # pass the prefix through a copy so the caller's cmd_args dict is left untouched
+        render_args = {**cmd_args, "srun_command_prefix": " ".join(srun_command_parts + nsys_command_parts)}
+        test_command_parts = self.generate_test_command(env_vars, render_args, tr)
+        return " ".join(test_command_parts)
+
+    def generate_test_command(
+        self, env_vars: dict[str, str], cmd_args: Dict[str, Union[str, List[str]]], tr: TestRun
+    ) -> list[str]:
+        """
+        Render the Ray launch script for this test run.
+
+        Args:
+            env_vars (dict[str, str]): Environment variables (unused here).
+            cmd_args (Dict[str, Union[str, List[str]]]): Must contain the "srun_command_prefix" entry.
+            tr (TestRun): Test run whose definition supplies the command and the optional conda env.
+
+        Returns:
+            list[str]: Single-element list holding the rendered script text.
+        """
+        tdef: SlurmRayContainerTestDefinition = cast(SlurmRayContainerTestDefinition, tr.test.test_definition)
+
+        command_parts: list[str] = [tdef.cmd_args.cmd]
+        if tr.test.extra_cmd_args:
+            command_parts.append(tr.test.extra_cmd_args)
+
+        # load the jinja template file which is placed at the same directory as this file
+        script_dir = Path(__file__).parent
+        template_path = script_dir / "slurm_ray_container_template.sh.jinja"
+        template = Template(template_path.read_text())
+
+        # `conda run` is a plain command prefix, so the whole command stays an argument of srun; a
+        # "conda activate ... &&" prefix would end the srun command at "&&" and run the rest outside of it
+        conda_env = tdef.cmd_args.conda_env
+        conda_activate_command = f"conda run --no-capture-output -n {conda_env}" if conda_env else ""
+
+        # render the template
+        rendered_template = template.render(
+            {
+                "conda_activate_command": conda_activate_command,
+                "command": " ".join(command_parts),
+                "srun_command_prefix": cmd_args["srun_command_prefix"],
+            }
+        )
+
+        return [rendered_template]
diff --git a/src/cloudai/workloads/slurm_ray_container/slurm_ray_container.py b/src/cloudai/workloads/slurm_ray_container/slurm_ray_container.py
new file mode 100644
index 000000000..742aeb6b2
--- /dev/null
+++ b/src/cloudai/workloads/slurm_ray_container/slurm_ray_container.py
@@ -0,0 +1,52 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
+
+from cloudai import CmdArgs, DockerImage, Installable, TestDefinition
+
+
+class SlurmRayContainerCmdArgs(CmdArgs):
+    """Command line arguments for a Slurm Ray container test."""
+
+    docker_image_url: str  # URL handed to DockerImage(url=...) by the test definition
+    cmd: str  # command the command generation strategy places into the rendered launch script
+    conda_env: Optional[str] = None  # optional conda environment name; None means no conda prefix is rendered
+
+
+class SlurmRayContainerTestDefinition(TestDefinition):
+    """Test definition for a Slurm Ray container test."""
+
+    cmd_args: SlurmRayContainerCmdArgs
+
+    _docker_image: Optional[DockerImage] = None  # cache backing the docker_image property
+
+    @property
+    def docker_image(self) -> DockerImage:  # created on first access, then reused
+        if not self._docker_image:
+            self._docker_image = DockerImage(url=self.cmd_args.docker_image_url)
+        return self._docker_image
+
+    @property
+    def installables(self) -> list[Installable]:  # the docker image followed by the definition's git repos
+        return [self.docker_image, *self.git_repos]
+
+    @property
+    def extra_args_str(self) -> str:  # "key value" pairs, or just "key" when the value is falsy, space-joined
+        parts = []
+        for k, v in self.extra_cmd_args.items():
+            parts.append(f"{k} {v}" if v else k)
+        return " ".join(parts)
diff --git a/src/cloudai/workloads/slurm_ray_container/slurm_ray_container_template.sh.jinja b/src/cloudai/workloads/slurm_ray_container/slurm_ray_container_template.sh.jinja
new file mode 100644
index 000000000..f22094f46
--- /dev/null
+++ b/src/cloudai/workloads/slurm_ray_container/slurm_ray_container_template.sh.jinja
@@ -0,0 +1,31 @@
+port=6379
+ip_head=$head_node_ip:$port
+export ip_head
+echo "IP Head: $ip_head"
+
+echo "Starting HEAD at $head_node"
+{{ srun_command_prefix }} --nodes=1 --ntasks=1 -w "$head_node" \
+    {{ conda_activate_command }} \
+    ray start --head --node-ip-address="$head_node_ip" --port=$port \
+    --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_TASK}" --block &
+
+# optional, though may be useful in certain versions of Ray < 1.0.
+sleep 10 + +# number of nodes other than the head node +worker_num=$((SLURM_JOB_NUM_NODES - 1)) + +for ((i = 1; i <= worker_num; i++)); do + node_i=${nodes_array[$i]} + echo "Starting WORKER $i at $node_i" + {{ srun_command_prefix }} --nodes=1 --ntasks=1 -w "$node_i" \ + {{ conda_activate_command }} \ + ray start --address "$ip_head" \ + --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_TASK}" --block & + sleep 5 +done + +{{ srun_command_prefix }} --nodes=1 --ntasks=1 \ + -w "$head_node" --gpus-per-node=0 \ + {{ conda_activate_command }} \ + {{ command }} diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py index cd7d0648d..31978cb0f 100644 --- a/tests/test_acceptance.py +++ b/tests/test_acceptance.py @@ -51,6 +51,11 @@ SlurmContainerCommandGenStrategy, SlurmContainerTestDefinition, ) +from cloudai.workloads.slurm_ray_container import ( + SlurmRayContainerCmdArgs, + SlurmRayContainerCommandGenStrategy, + SlurmRayContainerTestDefinition, +) from cloudai.workloads.ucc_test import UCCCmdArgs, UCCTestDefinition, UCCTestSlurmCommandGenStrategy SLURM_TEST_SCENARIOS = [ @@ -261,6 +266,20 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - ), SlurmContainerCommandGenStrategy, ), + "slurm_ray_container": lambda: create_test_run( + partial_tr, + slurm_system, + "slurm_ray_container", + SlurmRayContainerTestDefinition( + name="slurm_ray_container", + description="slurm_ray_container", + test_template_name="slurm_ray_container", + cmd_args=SlurmRayContainerCmdArgs( + docker_image_url="https://docker/url", cmd="pwd ; ls", conda_env="test" + ), + ), + SlurmRayContainerCommandGenStrategy, + ), "megatron-run": lambda: create_test_run( partial_tr, slurm_system, diff --git a/tests/test_init.py b/tests/test_init.py index bb3dd95d0..0ffcd1afe 100644 --- a/tests/test_init.py +++ b/tests/test_init.py @@ -66,6 +66,10 @@ SleepTestDefinition, ) from cloudai.workloads.slurm_container import SlurmContainerCommandGenStrategy, 
SlurmContainerTestDefinition +from cloudai.workloads.slurm_ray_container import ( + SlurmRayContainerCommandGenStrategy, + SlurmRayContainerTestDefinition, +) from cloudai.workloads.ucc_test import ( UCCTestDefinition, UCCTestGradingStrategy, @@ -99,6 +103,7 @@ def test_runners(): (CommandGenStrategy, SlurmSystem, NemotronTestDefinition): JaxToolboxSlurmCommandGenStrategy, (CommandGenStrategy, SlurmSystem, SleepTestDefinition): SleepSlurmCommandGenStrategy, (CommandGenStrategy, SlurmSystem, SlurmContainerTestDefinition): SlurmContainerCommandGenStrategy, + (CommandGenStrategy, SlurmSystem, SlurmRayContainerTestDefinition): SlurmRayContainerCommandGenStrategy, (CommandGenStrategy, SlurmSystem, UCCTestDefinition): UCCTestSlurmCommandGenStrategy, (CommandGenStrategy, SlurmSystem, MegatronRunTestDefinition): MegatronRunSlurmCommandGenStrategy, (CommandGenStrategy, StandaloneSystem, SleepTestDefinition): SleepStandaloneCommandGenStrategy, @@ -119,6 +124,7 @@ def test_runners(): (JobIdRetrievalStrategy, SlurmSystem, NemotronTestDefinition): SlurmJobIdRetrievalStrategy, (JobIdRetrievalStrategy, SlurmSystem, SleepTestDefinition): SlurmJobIdRetrievalStrategy, (JobIdRetrievalStrategy, SlurmSystem, SlurmContainerTestDefinition): SlurmJobIdRetrievalStrategy, + (JobIdRetrievalStrategy, SlurmSystem, SlurmRayContainerTestDefinition): SlurmJobIdRetrievalStrategy, (JobIdRetrievalStrategy, SlurmSystem, UCCTestDefinition): SlurmJobIdRetrievalStrategy, (JobIdRetrievalStrategy, SlurmSystem, MegatronRunTestDefinition): SlurmJobIdRetrievalStrategy, (JobIdRetrievalStrategy, StandaloneSystem, SleepTestDefinition): StandaloneJobIdRetrievalStrategy, @@ -133,6 +139,7 @@ def test_runners(): (JobStatusRetrievalStrategy, SlurmSystem, NemotronTestDefinition): JaxToolboxJobStatusRetrievalStrategy, (JobStatusRetrievalStrategy, SlurmSystem, SleepTestDefinition): DefaultJobStatusRetrievalStrategy, (JobStatusRetrievalStrategy, SlurmSystem, SlurmContainerTestDefinition): 
DefaultJobStatusRetrievalStrategy, + (JobStatusRetrievalStrategy, SlurmSystem, SlurmRayContainerTestDefinition): DefaultJobStatusRetrievalStrategy, (JobStatusRetrievalStrategy, SlurmSystem, UCCTestDefinition): DefaultJobStatusRetrievalStrategy, (JobStatusRetrievalStrategy, SlurmSystem, MegatronRunTestDefinition): DefaultJobStatusRetrievalStrategy, (JobStatusRetrievalStrategy, StandaloneSystem, SleepTestDefinition): DefaultJobStatusRetrievalStrategy, @@ -165,7 +172,7 @@ def test_installers(): def test_definitions(): test_defs = Registry().test_definitions_map - assert len(test_defs) == 11 + assert len(test_defs) == 12 for tdef in [ ("UCCTest", UCCTestDefinition), ("NcclTest", NCCLTestDefinition), @@ -177,6 +184,7 @@ def test_definitions(): ("JaxToolboxGrok", GrokTestDefinition), ("JaxToolboxNemotron", NemotronTestDefinition), ("SlurmContainer", SlurmContainerTestDefinition), + ("SlurmRayContainer", SlurmRayContainerTestDefinition), ("MegatronRun", MegatronRunTestDefinition), ]: assert test_defs[tdef[0]] == tdef[1] diff --git a/tests/test_test_scenario.py b/tests/test_test_scenario.py index aeab2e0c3..c2208068f 100644 --- a/tests/test_test_scenario.py +++ b/tests/test_test_scenario.py @@ -44,6 +44,10 @@ from cloudai.workloads.nemo_run import NeMoRunReportGenerationStrategy, NeMoRunTestDefinition from cloudai.workloads.sleep import SleepReportGenerationStrategy, SleepTestDefinition from cloudai.workloads.slurm_container import SlurmContainerReportGenerationStrategy, SlurmContainerTestDefinition +from cloudai.workloads.slurm_ray_container import ( + SlurmRayContainerReportGenerationStrategy, + SlurmRayContainerTestDefinition, +) from cloudai.workloads.ucc_test import UCCTestDefinition, UCCTestReportGenerationStrategy from tests.conftest import MyTestDefinition @@ -278,7 +282,7 @@ def test_default(self): assert len(reporters) == 0 def test_default_reporters_size(self): - assert len(DEFAULT_REPORTERS) == 11 + assert len(DEFAULT_REPORTERS) == 12 @pytest.mark.parametrize( 
"tdef,expected_reporters", @@ -293,6 +297,7 @@ def test_default_reporters_size(self): (NemotronTestDefinition, {JaxToolboxReportGenerationStrategy}), (SleepTestDefinition, {SleepReportGenerationStrategy}), (SlurmContainerTestDefinition, {SlurmContainerReportGenerationStrategy}), + (SlurmRayContainerTestDefinition, {SlurmRayContainerReportGenerationStrategy}), (UCCTestDefinition, {UCCTestReportGenerationStrategy}), ], )