From 037f9d0f31caef245953f37cf8179755e3142412 Mon Sep 17 00:00:00 2001 From: kohankhaki Date: Fri, 4 Apr 2025 07:52:09 -0400 Subject: [PATCH 01/13] Generate Slurm files dynamically and fix issues in venv.sh --- pyproject.toml | 2 +- vec_inf/cli/_helper.py | 43 ++---- vec_inf/cli/_slurm_script_generator.py | 195 +++++++++++++++++++++++++ venv.sh | 9 +- 4 files changed, 216 insertions(+), 33 deletions(-) create mode 100644 vec_inf/cli/_slurm_script_generator.py mode change 100644 => 100755 venv.sh diff --git a/pyproject.toml b/pyproject.toml index def192fc..1dab6bee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ description = "Efficient LLM inference on Slurm clusters using vLLM." readme = "README.md" authors = [{name = "Marshall Wang", email = "marshall.wang@vectorinstitute.ai"}] license = "MIT" -requires-python = ">=3.10" +requires-python = ">=3.10,<4.0" dependencies = [ "requests>=2.31.0", "click>=8.1.0", diff --git a/vec_inf/cli/_helper.py b/vec_inf/cli/_helper.py index bd520ac1..3a9efe31 100644 --- a/vec_inf/cli/_helper.py +++ b/vec_inf/cli/_helper.py @@ -16,6 +16,7 @@ import vec_inf.cli._utils as utils from vec_inf.cli._config import ModelConfig +from vec_inf.cli._slurm_script_generator import SlurmScriptGenerator VLLM_TASK_MAP = { @@ -127,31 +128,7 @@ def _get_launch_params(self) -> dict[str, Any]: def set_env_vars(self) -> None: """Set environment variables for the launch command.""" - os.environ["MODEL_NAME"] = self.model_name - os.environ["MAX_MODEL_LEN"] = self.params["max_model_len"] - os.environ["MAX_LOGPROBS"] = self.params["vocab_size"] - os.environ["DATA_TYPE"] = self.params["data_type"] - os.environ["MAX_NUM_SEQS"] = self.params["max_num_seqs"] - os.environ["GPU_MEMORY_UTILIZATION"] = self.params["gpu_memory_utilization"] - os.environ["TASK"] = VLLM_TASK_MAP[self.params["model_type"]] - os.environ["PIPELINE_PARALLELISM"] = self.params["pipeline_parallelism"] - os.environ["COMPILATION_CONFIG"] = self.params["compilation_config"] - os.environ["SRC_DIR"] = SRC_DIR - os.environ["MODEL_WEIGHTS"] = str( - Path(self.params["model_weights_parent_dir"], self.model_name) - ) os.environ["LD_LIBRARY_PATH"] = LD_LIBRARY_PATH - os.environ["VENV_BASE"] = self.params["venv"] - os.environ["LOG_DIR"] = self.params["log_dir"] - - if self.params.get("enable_prefix_caching"): - os.environ["ENABLE_PREFIX_CACHING"] = self.params["enable_prefix_caching"] - if self.params.get("enable_chunked_prefill"): - os.environ["ENABLE_CHUNKED_PREFILL"] = self.params["enable_chunked_prefill"] - if self.params.get("max_num_batched_tokens"): - os.environ["MAX_NUM_BATCHED_TOKENS"] = self.params["max_num_batched_tokens"] - if self.params.get("enforce_eager"): - os.environ["ENFORCE_EAGER"] = self.params["enforce_eager"] def build_launch_command(self) -> str: """Construct the full launch command with parameters.""" @@ -177,11 +154,19 @@ def build_launch_command(self) -> str: ] ) # Add slurm script - slurm_script = "vllm.slurm" - if int(self.params["num_nodes"]) > 1: - slurm_script = "multinode_vllm.slurm" - command_list.append(f"{SRC_DIR}/{slurm_script}") + # slurm_script = "vllm.slurm" + # if int(self.params["num_nodes"]) > 1: + # slurm_script = "multinode_vllm.slurm" + # command_list.append(f"{SRC_DIR}/{slurm_script}") + + slurm_script_path = SlurmScriptGenerator( + self.params, src_dir=SRC_DIR, is_multinode=int(self.params["num_nodes"]) > 1 + ).write_to_log_dir() + + command_list.append(str(slurm_script_path)) return " ".join(command_list) + + def format_table_output(self, job_id: str) -> Table: 
"""Format output as rich Table.""" @@ -214,7 +199,7 @@ def format_table_output(self, job_id: str) -> Table: ) if self.params.get("enforce_eager"): table.add_row("Enforce Eager", self.params["enforce_eager"]) - table.add_row("Model Weights Directory", os.environ.get("MODEL_WEIGHTS")) + table.add_row("Model Weights Directory", str(Path(self.params["model_weights_parent_dir"], self.model_name))) table.add_row("Log Directory", self.params["log_dir"]) return table diff --git a/vec_inf/cli/_slurm_script_generator.py b/vec_inf/cli/_slurm_script_generator.py new file mode 100644 index 00000000..5387ebbb --- /dev/null +++ b/vec_inf/cli/_slurm_script_generator.py @@ -0,0 +1,195 @@ +from pathlib import Path + +VLLM_TASK_MAP = { + "LLM": "generate", + "VLM": "generate", + "Text_Embedding": "embed", + "Reward_Modeling": "reward", +} + +class SlurmScriptGenerator: + def __init__(self, params: dict, src_dir: str, is_multinode: bool = False): + self.params = params + self.src_dir = src_dir + self.is_multinode = is_multinode + self.model_weights_path = Path(params["model_weights_parent_dir"], params["model_name"]) + self.task = VLLM_TASK_MAP[self.params["model_type"]] + + def _generate_script_content(self) -> str: + return self._generate_multinode_script() if self.is_multinode else self._generate_single_node_script() + + def _generate_preamble(self, is_multinode: bool = False) -> str: + base = [ + "#!/bin/bash", + "#SBATCH --cpus-per-task=16", + "#SBATCH --mem=64G", + ] + if is_multinode: + base += [ + "#SBATCH --exclusive", + "#SBATCH --tasks-per-node=1", + ] + base += [f"source {self.src_dir}/find_port.sh", ""] + return "\n".join(base) + + def _export_parallel_vars(self) -> str: + if self.is_multinode: + return """if [ "$PIPELINE_PARALLELISM" = "True" ]; then +export PIPELINE_PARALLEL_SIZE=$SLURM_JOB_NUM_NODES +export TENSOR_PARALLEL_SIZE=$SLURM_GPUS_PER_NODE +else +export PIPELINE_PARALLEL_SIZE=1 +export TENSOR_PARALLEL_SIZE=$((SLURM_JOB_NUM_NODES*SLURM_GPUS_PER_NODE)) +fi +""" + else: + return "export TENSOR_PARALLEL_SIZE=$SLURM_GPUS_PER_NODE\n" + + def _generate_shared_args(self) -> list[str]: + args = [ + f"--model {self.model_weights_path} \\", + f"--served-model-name {self.params['model_name']} \\", + "--host \"0.0.0.0\" \\", + "--port $vllm_port_number \\", + "--tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \\", + f"--dtype {self.params['data_type']} \\", + "--trust-remote-code \\", + f"--max-logprobs {self.params['vocab_size']} \\", + f"--max-model-len {self.params['max_model_len']} \\", + f"--max-num-seqs {self.params['max_num_seqs']} \\", + f"--gpu-memory-utilization {self.params['gpu_memory_utilization']} \\", + f"--compilation-config {self.params['compilation_config']} \\", + f"--task {self.task} \\", + ] + if self.is_multinode: + args.insert(4, "--pipeline-parallel-size ${PIPELINE_PARALLEL_SIZE} \\") + if self.params.get("max_num_batched_tokens"): + args.append(f"--max-num-batched-tokens={self.params['max_num_batched_tokens']} \\") + if self.params.get("enable_prefix_caching") == "True": + args.append("--enable-prefix-caching \\") + if self.params.get("enable_chunked_prefill") == "True": + args.append("--enable-chunked-prefill \\") + if self.params.get("enforce_eager") == "True": + args.append("--enforce-eager") + + return args + + def _generate_single_node_script(self) -> str: + preamble = self._generate_preamble(is_multinode=False) + + server = f"""hostname=${{SLURMD_NODENAME}} +vllm_port_number=$(find_available_port ${{hostname}} 8080 65535) + 
+SERVER_ADDR="http://${{hostname}}:${{vllm_port_number}}/v1" +echo "Server address: $SERVER_ADDR" + +JSON_PATH="{self.params['log_dir']}/{self.params['model_name']}.$SLURM_JOB_ID/{self.params['model_name']}.$SLURM_JOB_ID.json" +echo "Updating server address in $JSON_PATH" +jq --arg server_addr "$SERVER_ADDR" \\ + '. + {{"server_address": $server_addr}}' \\ + "$JSON_PATH" > temp.json \\ + && mv temp.json "$JSON_PATH" \\ + && rm -f temp.json +""" + + env_exports = self._export_parallel_vars() + + if self.params["venv"] == "singularity": + launcher = f"""export SINGULARITY_IMAGE=/model-weights/vec-inf-shared/vector-inference_latest.sif +export VLLM_NCCL_SO_PATH=/vec-inf/nccl/libnccl.so.2.18.1 +module load singularity-ce/3.8.2 +singularity exec $SINGULARITY_IMAGE ray stop +singularity exec --nv --bind {self.model_weights_path}:{self.model_weights_path} $SINGULARITY_IMAGE \\ +python3.10 -m vllm.entrypoints.openai.api_server \\ +""" + else: + launcher = f"""source {self.params['venv']}/bin/activate +python3 -m vllm.entrypoints.openai.api_server \\ +""" + + args = "\n".join(self._generate_shared_args()) + return preamble + server + env_exports + launcher + args + + + def _generate_multinode_script(self) -> str: + preamble = self._generate_preamble(is_multinode=True) + + cluster_setup = [] + if self.params["venv"] == "singularity": + cluster_setup.append(f"""export SINGULARITY_IMAGE=/model-weights/vec-inf-shared/vector-inference_latest.sif +export VLLM_NCCL_SO_PATH=/vec-inf/nccl/libnccl.so.2.18.1 +module load singularity-ce/3.8.2 +singularity exec $SINGULARITY_IMAGE ray stop +""") + + cluster_setup.append(f"""nodes=$(scontrol show hostnames "${{SLURM_JOB_NODELIST}}") +nodes_array=(${{nodes}}) + +head_node=${{nodes_array[0]}} +head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) + +head_node_port=$(find_available_port $head_node_ip 8080 65535) +vllm_port_number=$(find_available_port $head_node_ip 8080 65535) + +ip_head=$head_node_ip:$head_node_port +export ip_head +echo "IP Head: $ip_head" + +echo "Starting HEAD at $head_node" +srun --nodes=1 --ntasks=1 -w "$head_node" \\""") + + if self.params["venv"] == "singularity": + cluster_setup.append(f""" singularity exec --nv --bind {self.model_weights_path}:{self.model_weights_path} $SINGULARITY_IMAGE \\""") + + cluster_setup.append(f""" ray start --head --node-ip-address="$head_node_ip" --port=$head_node_port \\ + --num-cpus "${{SLURM_CPUS_PER_TASK}}" --num-gpus "${{SLURM_GPUS_PER_NODE}}" --block & + +sleep 10 +worker_num=$((SLURM_JOB_NUM_NODES - 1)) + +for ((i = 1; i <= worker_num; i++)); do + node_i=${{nodes_array[$i]}} + echo "Starting WORKER $i at $node_i" + srun --nodes=1 --ntasks=1 -w "$node_i" \\""") + + if self.params["venv"] == "singularity": + cluster_setup.append(f""" singularity exec --nv --bind {self.model_weights_path}:{self.model_weights_path} $SINGULARITY_IMAGE \\""") + cluster_setup.append(f""" ray start --address "$ip_head" \\ + --num-cpus "${{SLURM_CPUS_PER_TASK}}" --num-gpus "${{SLURM_GPUS_PER_NODE}}" --block & + sleep 5 +done + +SERVER_ADDR="http://$head_node_ip:$vllm_port_number/v1" +echo "Server address: $SERVER_ADDR" + +JSON_PATH="{self.params['log_dir']}/{self.params['model_name']}.$SLURM_JOB_ID/{self.params['model_name']}.$SLURM_JOB_ID.json" +echo "Updating server address in $JSON_PATH" +jq --arg server_addr "$SERVER_ADDR" \\ + '. 
+ {{"server_address": $server_addr}}' \\ + "$JSON_PATH" > temp.json \\ + && mv temp.json "$JSON_PATH" \\ + && rm -f temp.json +""") + cluster_setup = "\n".join(cluster_setup) + env_exports = self._export_parallel_vars() + + + if self.params["venv"] == "singularity": + launcher = f"""singularity exec --nv --bind {self.model_weights_path}:{self.model_weights_path} $SINGULARITY_IMAGE \\ +python3.10 -m vllm.entrypoints.openai.api_server \\ +""" + else: + launcher = f"""source {self.params['venv']}/bin/activate +python3 -m vllm.entrypoints.openai.api_server \\ +""" + + args = "\n".join(self._generate_shared_args()) + return preamble + cluster_setup + env_exports + launcher + args + + def write_to_log_dir(self) -> Path: + log_subdir = Path(self.params["log_dir"]) / self.params["model_name"] + log_subdir.mkdir(parents=True, exist_ok=True) + script_path = log_subdir / "launch.slurm" + content = self._generate_script_content() + script_path.write_text(content) + return script_path diff --git a/venv.sh b/venv.sh old mode 100644 new mode 100755 index a37eaade..82811638 --- a/venv.sh +++ b/venv.sh @@ -1,8 +1,8 @@ -#!bin/bash +#!/bin/bash # Load python module if you are on Vector cluster and install poetry module load python/3.10.12 -pip install poetry +pip3 install poetry # Optional: it's recommended to change the cache directory to somewhere in the scratch space to avoid # running out of space in your home directory, below is an example for the Vector cluster @@ -13,11 +13,14 @@ export POETRY_CACHE_DIR=/scratch/ssd004/scratch/$(whoami)/poetry_cache # poetry config cache-dir echo "Cache directory set to: $(poetry config cache-dir)" +echo "📜 Telling Poetry to use Python 3.10..." +poetry env use python3.10 + # Install dependencies via poetry poetry install # Activate the virtual environment -poetry shell +# poetry shell # Deactivate the virtual environment # deactivate From fdd02d5422571be8fc87e73d009388828b787e5f Mon Sep 17 00:00:00 2001 From: kohankhaki Date: Fri, 4 Apr 2025 08:54:02 -0400 Subject: [PATCH 02/13] fixed vllm port number issue. 
--- vec_inf/cli/_slurm_script_generator.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/vec_inf/cli/_slurm_script_generator.py b/vec_inf/cli/_slurm_script_generator.py index 5387ebbb..d031189a 100644 --- a/vec_inf/cli/_slurm_script_generator.py +++ b/vec_inf/cli/_slurm_script_generator.py @@ -122,14 +122,13 @@ def _generate_multinode_script(self) -> str: singularity exec $SINGULARITY_IMAGE ray stop """) - cluster_setup.append(f"""nodes=$(scontrol show hostnames "${{SLURM_JOB_NODELIST}}") -nodes_array=(${{nodes}}) + cluster_setup.append(f"""nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST") +nodes_array=($nodes) head_node=${{nodes_array[0]}} head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) head_node_port=$(find_available_port $head_node_ip 8080 65535) -vllm_port_number=$(find_available_port $head_node_ip 8080 65535) ip_head=$head_node_ip:$head_node_port export ip_head @@ -159,7 +158,11 @@ def _generate_multinode_script(self) -> str: sleep 5 done -SERVER_ADDR="http://$head_node_ip:$vllm_port_number/v1" + +vllm_port_number=$(find_available_port $head_node_ip 8080 65535) + + +SERVER_ADDR="http://${{head_node_ip}}:${{vllm_port_number}}/v1" echo "Server address: $SERVER_ADDR" JSON_PATH="{self.params['log_dir']}/{self.params['model_name']}.$SLURM_JOB_ID/{self.params['model_name']}.$SLURM_JOB_ID.json" From 85b756591949b56d2ee1165d19bd2d6702a07190 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 4 Apr 2025 13:09:06 +0000 Subject: [PATCH 03/13] [pre-commit.ci] Add auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- vec_inf/cli/_helper.py | 7 +-- vec_inf/cli/_slurm_script_generator.py | 61 +++++++++++++++----------- 2 files changed, 40 insertions(+), 28 deletions(-) diff --git a/vec_inf/cli/_helper.py b/vec_inf/cli/_helper.py index 3a9efe31..73dd82b3 100644 --- a/vec_inf/cli/_helper.py +++ b/vec_inf/cli/_helper.py @@ -165,8 +165,6 @@ def build_launch_command(self) -> str: command_list.append(str(slurm_script_path)) return " ".join(command_list) - - def format_table_output(self, job_id: str) -> Table: """Format output as rich Table.""" @@ -199,7 +197,10 @@ def format_table_output(self, job_id: str) -> Table: ) if self.params.get("enforce_eager"): table.add_row("Enforce Eager", self.params["enforce_eager"]) - table.add_row("Model Weights Directory", str(Path(self.params["model_weights_parent_dir"], self.model_name))) + table.add_row( + "Model Weights Directory", + str(Path(self.params["model_weights_parent_dir"], self.model_name)), + ) table.add_row("Log Directory", self.params["log_dir"]) return table diff --git a/vec_inf/cli/_slurm_script_generator.py b/vec_inf/cli/_slurm_script_generator.py index d031189a..b0004887 100644 --- a/vec_inf/cli/_slurm_script_generator.py +++ b/vec_inf/cli/_slurm_script_generator.py @@ -1,5 +1,6 @@ from pathlib import Path + VLLM_TASK_MAP = { "LLM": "generate", "VLM": "generate", @@ -7,16 +8,23 @@ "Reward_Modeling": "reward", } + class SlurmScriptGenerator: def __init__(self, params: dict, src_dir: str, is_multinode: bool = False): self.params = params self.src_dir = src_dir self.is_multinode = is_multinode - self.model_weights_path = Path(params["model_weights_parent_dir"], params["model_name"]) + self.model_weights_path = Path( + params["model_weights_parent_dir"], params["model_name"] + ) self.task = VLLM_TASK_MAP[self.params["model_type"]] def _generate_script_content(self) -> str: - return 
self._generate_multinode_script() if self.is_multinode else self._generate_single_node_script() + return ( + self._generate_multinode_script() + if self.is_multinode + else self._generate_single_node_script() + ) def _generate_preamble(self, is_multinode: bool = False) -> str: base = [ @@ -42,14 +50,13 @@ def _export_parallel_vars(self) -> str: export TENSOR_PARALLEL_SIZE=$((SLURM_JOB_NUM_NODES*SLURM_GPUS_PER_NODE)) fi """ - else: - return "export TENSOR_PARALLEL_SIZE=$SLURM_GPUS_PER_NODE\n" + return "export TENSOR_PARALLEL_SIZE=$SLURM_GPUS_PER_NODE\n" def _generate_shared_args(self) -> list[str]: args = [ f"--model {self.model_weights_path} \\", f"--served-model-name {self.params['model_name']} \\", - "--host \"0.0.0.0\" \\", + '--host "0.0.0.0" \\', "--port $vllm_port_number \\", "--tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \\", f"--dtype {self.params['data_type']} \\", @@ -64,7 +71,9 @@ def _generate_shared_args(self) -> list[str]: if self.is_multinode: args.insert(4, "--pipeline-parallel-size ${PIPELINE_PARALLEL_SIZE} \\") if self.params.get("max_num_batched_tokens"): - args.append(f"--max-num-batched-tokens={self.params['max_num_batched_tokens']} \\") + args.append( + f"--max-num-batched-tokens={self.params['max_num_batched_tokens']} \\" + ) if self.params.get("enable_prefix_caching") == "True": args.append("--enable-prefix-caching \\") if self.params.get("enable_chunked_prefill") == "True": @@ -83,7 +92,7 @@ def _generate_single_node_script(self) -> str: SERVER_ADDR="http://${{hostname}}:${{vllm_port_number}}/v1" echo "Server address: $SERVER_ADDR" -JSON_PATH="{self.params['log_dir']}/{self.params['model_name']}.$SLURM_JOB_ID/{self.params['model_name']}.$SLURM_JOB_ID.json" +JSON_PATH="{self.params["log_dir"]}/{self.params["model_name"]}.$SLURM_JOB_ID/{self.params["model_name"]}.$SLURM_JOB_ID.json" echo "Updating server address in $JSON_PATH" jq --arg server_addr "$SERVER_ADDR" \\ '. 
+ {{"server_address": $server_addr}}' \\ @@ -103,29 +112,28 @@ def _generate_single_node_script(self) -> str: python3.10 -m vllm.entrypoints.openai.api_server \\ """ else: - launcher = f"""source {self.params['venv']}/bin/activate + launcher = f"""source {self.params["venv"]}/bin/activate python3 -m vllm.entrypoints.openai.api_server \\ """ args = "\n".join(self._generate_shared_args()) return preamble + server + env_exports + launcher + args - - + def _generate_multinode_script(self) -> str: preamble = self._generate_preamble(is_multinode=True) cluster_setup = [] if self.params["venv"] == "singularity": - cluster_setup.append(f"""export SINGULARITY_IMAGE=/model-weights/vec-inf-shared/vector-inference_latest.sif + cluster_setup.append("""export SINGULARITY_IMAGE=/model-weights/vec-inf-shared/vector-inference_latest.sif export VLLM_NCCL_SO_PATH=/vec-inf/nccl/libnccl.so.2.18.1 module load singularity-ce/3.8.2 singularity exec $SINGULARITY_IMAGE ray stop """) - cluster_setup.append(f"""nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST") + cluster_setup.append("""nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST") nodes_array=($nodes) -head_node=${{nodes_array[0]}} +head_node=${nodes_array[0]} head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) head_node_port=$(find_available_port $head_node_ip 8080 65535) @@ -133,56 +141,59 @@ def _generate_multinode_script(self) -> str: ip_head=$head_node_ip:$head_node_port export ip_head echo "IP Head: $ip_head" - + echo "Starting HEAD at $head_node" srun --nodes=1 --ntasks=1 -w "$head_node" \\""") if self.params["venv"] == "singularity": - cluster_setup.append(f""" singularity exec --nv --bind {self.model_weights_path}:{self.model_weights_path} $SINGULARITY_IMAGE \\""") + cluster_setup.append( + f""" singularity exec --nv --bind {self.model_weights_path}:{self.model_weights_path} $SINGULARITY_IMAGE \\""" + ) - cluster_setup.append(f""" ray start --head --node-ip-address="$head_node_ip" --port=$head_node_port \\ - --num-cpus "${{SLURM_CPUS_PER_TASK}}" --num-gpus "${{SLURM_GPUS_PER_NODE}}" --block & + cluster_setup.append(""" ray start --head --node-ip-address="$head_node_ip" --port=$head_node_port \\ + --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_NODE}" --block & sleep 10 worker_num=$((SLURM_JOB_NUM_NODES - 1)) for ((i = 1; i <= worker_num; i++)); do - node_i=${{nodes_array[$i]}} + node_i=${nodes_array[$i]} echo "Starting WORKER $i at $node_i" srun --nodes=1 --ntasks=1 -w "$node_i" \\""") if self.params["venv"] == "singularity": - cluster_setup.append(f""" singularity exec --nv --bind {self.model_weights_path}:{self.model_weights_path} $SINGULARITY_IMAGE \\""") + cluster_setup.append( + f""" singularity exec --nv --bind {self.model_weights_path}:{self.model_weights_path} $SINGULARITY_IMAGE \\""" + ) cluster_setup.append(f""" ray start --address "$ip_head" \\ --num-cpus "${{SLURM_CPUS_PER_TASK}}" --num-gpus "${{SLURM_GPUS_PER_NODE}}" --block & sleep 5 done - + vllm_port_number=$(find_available_port $head_node_ip 8080 65535) - + SERVER_ADDR="http://${{head_node_ip}}:${{vllm_port_number}}/v1" echo "Server address: $SERVER_ADDR" -JSON_PATH="{self.params['log_dir']}/{self.params['model_name']}.$SLURM_JOB_ID/{self.params['model_name']}.$SLURM_JOB_ID.json" +JSON_PATH="{self.params["log_dir"]}/{self.params["model_name"]}.$SLURM_JOB_ID/{self.params["model_name"]}.$SLURM_JOB_ID.json" echo "Updating server address in $JSON_PATH" jq --arg server_addr "$SERVER_ADDR" \\ '. 
+ {{"server_address": $server_addr}}' \\ "$JSON_PATH" > temp.json \\ && mv temp.json "$JSON_PATH" \\ - && rm -f temp.json + && rm -f temp.json """) cluster_setup = "\n".join(cluster_setup) env_exports = self._export_parallel_vars() - if self.params["venv"] == "singularity": launcher = f"""singularity exec --nv --bind {self.model_weights_path}:{self.model_weights_path} $SINGULARITY_IMAGE \\ python3.10 -m vllm.entrypoints.openai.api_server \\ """ else: - launcher = f"""source {self.params['venv']}/bin/activate + launcher = f"""source {self.params["venv"]}/bin/activate python3 -m vllm.entrypoints.openai.api_server \\ """ From 8d88cb12716be410e8887dcf6fa1944cf09aa98c Mon Sep 17 00:00:00 2001 From: kohankhaki Date: Fri, 4 Apr 2025 09:30:32 -0400 Subject: [PATCH 04/13] fixed precommit. --- vec_inf/cli/_helper.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/vec_inf/cli/_helper.py b/vec_inf/cli/_helper.py index 73dd82b3..228f28f5 100644 --- a/vec_inf/cli/_helper.py +++ b/vec_inf/cli/_helper.py @@ -153,11 +153,6 @@ def build_launch_command(self) -> str: f"{self.params['log_dir']}/{self.model_name}.%j/{self.model_name}.%j.err", ] ) - # Add slurm script - # slurm_script = "vllm.slurm" - # if int(self.params["num_nodes"]) > 1: - # slurm_script = "multinode_vllm.slurm" - # command_list.append(f"{SRC_DIR}/{slurm_script}") slurm_script_path = SlurmScriptGenerator( self.params, src_dir=SRC_DIR, is_multinode=int(self.params["num_nodes"]) > 1 From 1d2a1aef5e70e30c1a49980414645779a419fa40 Mon Sep 17 00:00:00 2001 From: kohankhaki Date: Fri, 4 Apr 2025 12:13:39 -0400 Subject: [PATCH 05/13] refactored slurm generation. --- vec_inf/cli/_helper.py | 2 +- vec_inf/cli/_slurm_script_generator.py | 157 +++++++++++-------------- vec_inf/multinode_vllm.slurm | 154 ------------------------ vec_inf/vllm.slurm | 90 -------------- 4 files changed, 72 insertions(+), 331 deletions(-) delete mode 100644 vec_inf/multinode_vllm.slurm delete mode 100644 vec_inf/vllm.slurm diff --git a/vec_inf/cli/_helper.py b/vec_inf/cli/_helper.py index 228f28f5..dcaf76ae 100644 --- a/vec_inf/cli/_helper.py +++ b/vec_inf/cli/_helper.py @@ -155,7 +155,7 @@ def build_launch_command(self) -> str: ) slurm_script_path = SlurmScriptGenerator( - self.params, src_dir=SRC_DIR, is_multinode=int(self.params["num_nodes"]) > 1 + self.params, src_dir=SRC_DIR ).write_to_log_dir() command_list.append(str(slurm_script_path)) diff --git a/vec_inf/cli/_slurm_script_generator.py b/vec_inf/cli/_slurm_script_generator.py index b0004887..baf44579 100644 --- a/vec_inf/cli/_slurm_script_generator.py +++ b/vec_inf/cli/_slurm_script_generator.py @@ -1,4 +1,6 @@ +from datetime import datetime from pathlib import Path +from typing import Any VLLM_TASK_MAP = { @@ -10,49 +12,51 @@ class SlurmScriptGenerator: - def __init__(self, params: dict, src_dir: str, is_multinode: bool = False): + def __init__(self, params: dict[str, Any], src_dir: str): self.params = params self.src_dir = src_dir - self.is_multinode = is_multinode - self.model_weights_path = Path( - params["model_weights_parent_dir"], params["model_name"] + self.is_multinode = int(self.params["num_nodes"]) > 1 + self.model_weights_path = str( + Path(params["model_weights_parent_dir"], params["model_name"]) ) self.task = VLLM_TASK_MAP[self.params["model_type"]] def _generate_script_content(self) -> str: - return ( - self._generate_multinode_script() - if self.is_multinode - else self._generate_single_node_script() - ) + preamble = self._generate_preamble() + server = self._generate_server_script() + 
env_exports = self._export_parallel_vars() + launcher = self._generate_launcher() + args = self._generate_shared_args() + return preamble + server + env_exports + launcher + args - def _generate_preamble(self, is_multinode: bool = False) -> str: + def _generate_preamble(self) -> str: base = [ "#!/bin/bash", "#SBATCH --cpus-per-task=16", "#SBATCH --mem=64G", ] - if is_multinode: + if self.is_multinode: base += [ "#SBATCH --exclusive", "#SBATCH --tasks-per-node=1", ] - base += [f"source {self.src_dir}/find_port.sh", ""] + base += [""] return "\n".join(base) def _export_parallel_vars(self) -> str: if self.is_multinode: return """if [ "$PIPELINE_PARALLELISM" = "True" ]; then -export PIPELINE_PARALLEL_SIZE=$SLURM_JOB_NUM_NODES -export TENSOR_PARALLEL_SIZE=$SLURM_GPUS_PER_NODE + export PIPELINE_PARALLEL_SIZE=$SLURM_JOB_NUM_NODES + export TENSOR_PARALLEL_SIZE=$SLURM_GPUS_PER_NODE else -export PIPELINE_PARALLEL_SIZE=1 -export TENSOR_PARALLEL_SIZE=$((SLURM_JOB_NUM_NODES*SLURM_GPUS_PER_NODE)) + export PIPELINE_PARALLEL_SIZE=1 + export TENSOR_PARALLEL_SIZE=$((SLURM_JOB_NUM_NODES*SLURM_GPUS_PER_NODE)) fi + """ - return "export TENSOR_PARALLEL_SIZE=$SLURM_GPUS_PER_NODE\n" + return "export TENSOR_PARALLEL_SIZE=$SLURM_GPUS_PER_NODE\n\n" - def _generate_shared_args(self) -> list[str]: + def _generate_shared_args(self) -> str: args = [ f"--model {self.model_weights_path} \\", f"--served-model-name {self.params['model_name']} \\", @@ -81,56 +85,44 @@ def _generate_shared_args(self) -> list[str]: if self.params.get("enforce_eager") == "True": args.append("--enforce-eager") - return args - - def _generate_single_node_script(self) -> str: - preamble = self._generate_preamble(is_multinode=False) - - server = f"""hostname=${{SLURMD_NODENAME}} -vllm_port_number=$(find_available_port ${{hostname}} 8080 65535) - -SERVER_ADDR="http://${{hostname}}:${{vllm_port_number}}/v1" -echo "Server address: $SERVER_ADDR" + return "\n".join(args) + def _generate_server_script(self) -> str: + server_script = [""] + if self.params["venv"] == "singularity": + server_script.append("""export SINGULARITY_IMAGE=/model-weights/vec-inf-shared/vector-inference_latest.sif +export VLLM_NCCL_SO_PATH=/vec-inf/nccl/libnccl.so.2.18.1 +module load singularity-ce/3.8.2 +singularity exec $SINGULARITY_IMAGE ray stop +""") + server_script.append(f"source {self.src_dir}/find_port.sh\n") + server_script.append( + self._generate_multinode_server_script() + if self.is_multinode + else self._generate_single_node_server_script() + ) + server_script.append(f"""echo "Updating server address in $JSON_PATH" JSON_PATH="{self.params["log_dir"]}/{self.params["model_name"]}.$SLURM_JOB_ID/{self.params["model_name"]}.$SLURM_JOB_ID.json" -echo "Updating server address in $JSON_PATH" jq --arg server_addr "$SERVER_ADDR" \\ '. 
+ {{"server_address": $server_addr}}' \\ "$JSON_PATH" > temp.json \\ && mv temp.json "$JSON_PATH" \\ && rm -f temp.json -""" - env_exports = self._export_parallel_vars() - - if self.params["venv"] == "singularity": - launcher = f"""export SINGULARITY_IMAGE=/model-weights/vec-inf-shared/vector-inference_latest.sif -export VLLM_NCCL_SO_PATH=/vec-inf/nccl/libnccl.so.2.18.1 -module load singularity-ce/3.8.2 -singularity exec $SINGULARITY_IMAGE ray stop -singularity exec --nv --bind {self.model_weights_path}:{self.model_weights_path} $SINGULARITY_IMAGE \\ -python3.10 -m vllm.entrypoints.openai.api_server \\ -""" - else: - launcher = f"""source {self.params["venv"]}/bin/activate -python3 -m vllm.entrypoints.openai.api_server \\ -""" - - args = "\n".join(self._generate_shared_args()) - return preamble + server + env_exports + launcher + args +""") + return "\n".join(server_script) - def _generate_multinode_script(self) -> str: - preamble = self._generate_preamble(is_multinode=True) + def _generate_single_node_server_script(self) -> str: + return """hostname=${SLURMD_NODENAME} +vllm_port_number=$(find_available_port ${hostname} 8080 65535) - cluster_setup = [] - if self.params["venv"] == "singularity": - cluster_setup.append("""export SINGULARITY_IMAGE=/model-weights/vec-inf-shared/vector-inference_latest.sif -export VLLM_NCCL_SO_PATH=/vec-inf/nccl/libnccl.so.2.18.1 -module load singularity-ce/3.8.2 -singularity exec $SINGULARITY_IMAGE ray stop -""") +SERVER_ADDR="http://${hostname}:${vllm_port_number}/v1" +echo "Server address: $SERVER_ADDR" +""" - cluster_setup.append("""nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST") + def _generate_multinode_server_script(self) -> str: + server_script = [] + server_script.append("""nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST") nodes_array=($nodes) head_node=${nodes_array[0]} @@ -146,11 +138,11 @@ def _generate_multinode_script(self) -> str: srun --nodes=1 --ntasks=1 -w "$head_node" \\""") if self.params["venv"] == "singularity": - cluster_setup.append( - f""" singularity exec --nv --bind {self.model_weights_path}:{self.model_weights_path} $SINGULARITY_IMAGE \\""" + server_script.append( + f" singularity exec --nv --bind {self.model_weights_path}:{self.model_weights_path} $SINGULARITY_IMAGE \\" ) - cluster_setup.append(""" ray start --head --node-ip-address="$head_node_ip" --port=$head_node_port \\ + server_script.append(""" ray start --head --node-ip-address="$head_node_ip" --port=$head_node_port \\ --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_NODE}" --block & sleep 10 @@ -162,48 +154,41 @@ def _generate_multinode_script(self) -> str: srun --nodes=1 --ntasks=1 -w "$node_i" \\""") if self.params["venv"] == "singularity": - cluster_setup.append( + server_script.append( f""" singularity exec --nv --bind {self.model_weights_path}:{self.model_weights_path} $SINGULARITY_IMAGE \\""" ) - cluster_setup.append(f""" ray start --address "$ip_head" \\ - --num-cpus "${{SLURM_CPUS_PER_TASK}}" --num-gpus "${{SLURM_GPUS_PER_NODE}}" --block & + server_script.append(""" ray start --address "$ip_head" \\ + --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_NODE}" --block & sleep 5 done - vllm_port_number=$(find_available_port $head_node_ip 8080 65535) - -SERVER_ADDR="http://${{head_node_ip}}:${{vllm_port_number}}/v1" +SERVER_ADDR="http://${head_node_ip}:${vllm_port_number}/v1" echo "Server address: $SERVER_ADDR" 
-JSON_PATH="{self.params["log_dir"]}/{self.params["model_name"]}.$SLURM_JOB_ID/{self.params["model_name"]}.$SLURM_JOB_ID.json" -echo "Updating server address in $JSON_PATH" -jq --arg server_addr "$SERVER_ADDR" \\ - '. + {{"server_address": $server_addr}}' \\ - "$JSON_PATH" > temp.json \\ - && mv temp.json "$JSON_PATH" \\ - && rm -f temp.json """) - cluster_setup = "\n".join(cluster_setup) - env_exports = self._export_parallel_vars() + return "\n".join(server_script) + def _generate_launcher(self) -> str: if self.params["venv"] == "singularity": - launcher = f"""singularity exec --nv --bind {self.model_weights_path}:{self.model_weights_path} $SINGULARITY_IMAGE \\ -python3.10 -m vllm.entrypoints.openai.api_server \\ -""" + launcher_script = [ + f"""singularity exec --nv --bind {self.model_weights_path}:{self.model_weights_path} $SINGULARITY_IMAGE \\""" + ] else: - launcher = f"""source {self.params["venv"]}/bin/activate -python3 -m vllm.entrypoints.openai.api_server \\ -""" - - args = "\n".join(self._generate_shared_args()) - return preamble + cluster_setup + env_exports + launcher + args + launcher_script = [f"""source {self.params["venv"]}/bin/activate"""] + launcher_script.append( + """python3.10 -m vllm.entrypoints.openai.api_server \\\n""" + ) + return "\n".join(launcher_script) def write_to_log_dir(self) -> Path: - log_subdir = Path(self.params["log_dir"]) / self.params["model_name"] + log_subdir: Path = Path(self.params["log_dir"]) / self.params["model_name"] log_subdir.mkdir(parents=True, exist_ok=True) - script_path = log_subdir / "launch.slurm" + + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + script_path: Path = log_subdir / f"launch_{timestamp}.slurm" + content = self._generate_script_content() script_path.write_text(content) return script_path diff --git a/vec_inf/multinode_vllm.slurm b/vec_inf/multinode_vllm.slurm deleted file mode 100644 index 544ac136..00000000 --- a/vec_inf/multinode_vllm.slurm +++ /dev/null @@ -1,154 +0,0 @@ -#!/bin/bash -#SBATCH --cpus-per-task=16 -#SBATCH --mem=64G -#SBATCH --exclusive -#SBATCH --tasks-per-node=1 - -source ${SRC_DIR}/find_port.sh - -if [ "$VENV_BASE" = "singularity" ]; then - export SINGULARITY_IMAGE=/model-weights/vec-inf-shared/vector-inference_latest.sif - export VLLM_NCCL_SO_PATH=/vec-inf/nccl/libnccl.so.2.18.1 - module load singularity-ce/3.8.2 - singularity exec $SINGULARITY_IMAGE ray stop -fi - -# Getting the node names -nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -nodes_array=($nodes) - -head_node=${nodes_array[0]} -head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) - -# Find port for head node -head_node_port=$(find_available_port $head_node_ip 8080 65535) - -# Starting the Ray head node -ip_head=$head_node_ip:$head_node_port -export ip_head -echo "IP Head: $ip_head" - -echo "Starting HEAD at $head_node" -if [ "$VENV_BASE" = "singularity" ]; then - srun --nodes=1 --ntasks=1 -w "$head_node" \ - singularity exec --nv --bind ${MODEL_WEIGHTS}:${MODEL_WEIGHTS} $SINGULARITY_IMAGE \ - ray start --head --node-ip-address="$head_node_ip" --port=$head_node_port \ - --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_NODE}" --block & -else - srun --nodes=1 --ntasks=1 -w "$head_node" \ - ray start --head --node-ip-address="$head_node_ip" --port=$head_node_port \ - --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_NODE}" --block & -fi - -# Starting the Ray worker nodes -# Optional, though may be useful in certain versions of Ray < 1.0. 
-sleep 10 - -# number of nodes other than the head node -worker_num=$((SLURM_JOB_NUM_NODES - 1)) - -for ((i = 1; i <= worker_num; i++)); do - node_i=${nodes_array[$i]} - echo "Starting WORKER $i at $node_i" - if [ "$VENV_BASE" = "singularity" ]; then - srun --nodes=1 --ntasks=1 -w "$node_i" \ - singularity exec --nv --bind ${MODEL_WEIGHTS}:${MODEL_WEIGHTS} $SINGULARITY_IMAGE \ - ray start --address "$ip_head" \ - --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_NODE}" --block & - else - srun --nodes=1 --ntasks=1 -w "$node_i" \ - ray start --address "$ip_head" \ - --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_NODE}" --block & - fi - - sleep 5 -done - - -vllm_port_number=$(find_available_port $head_node_ip 8080 65535) - -SERVER_ADDR="http://${head_node_ip}:${vllm_port_number}/v1" -echo "Server address: $SERVER_ADDR" - -jq --arg server_addr "$SERVER_ADDR" \ - '. + {"server_address": $server_addr}' \ - "$LOG_DIR/$MODEL_NAME.$SLURM_JOB_ID/$MODEL_NAME.$SLURM_JOB_ID.json" > temp.json \ - && mv temp.json "$LOG_DIR/$MODEL_NAME.$SLURM_JOB_ID/$MODEL_NAME.$SLURM_JOB_ID.json" \ - && rm temp.json - -if [ "$PIPELINE_PARALLELISM" = "True" ]; then - export PIPELINE_PARALLEL_SIZE=$SLURM_JOB_NUM_NODES - export TENSOR_PARALLEL_SIZE=$SLURM_GPUS_PER_NODE -else - export PIPELINE_PARALLEL_SIZE=1 - export TENSOR_PARALLEL_SIZE=$((SLURM_JOB_NUM_NODES*SLURM_GPUS_PER_NODE)) -fi - -if [ "$ENFORCE_EAGER" = "True" ]; then - export ENFORCE_EAGER="--enforce-eager" -else - export ENFORCE_EAGER="" -fi - -if [ "$ENABLE_PREFIX_CACHING" = "True" ]; then - export ENABLE_PREFIX_CACHING="--enable-prefix-caching" -else - export ENABLE_PREFIX_CACHING="" -fi - -if [ "$ENABLE_CHUNKED_PREFILL" = "True" ]; then - export ENABLE_CHUNKED_PREFILL="--enable-chunked-prefill" -else - export ENABLE_CHUNKED_PREFILL="" -fi - -if [ -z "$MAX_NUM_BATCHED_TOKENS" ]; then - export MAX_NUM_BATCHED_TOKENS="" -else - export MAX_NUM_BATCHED_TOKENS="--max-num-batched-tokens=$MAX_NUM_BATCHED_TOKENS" -fi - -# Activate vllm venv -if [ "$VENV_BASE" = "singularity" ]; then - singularity exec --nv --bind ${MODEL_WEIGHTS}:${MODEL_WEIGHTS} $SINGULARITY_IMAGE \ - python3.10 -m vllm.entrypoints.openai.api_server \ - --model ${MODEL_WEIGHTS} \ - --served-model-name ${MODEL_NAME} \ - --host "0.0.0.0" \ - --port ${vllm_port_number} \ - --pipeline-parallel-size ${PIPELINE_PARALLEL_SIZE} \ - --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \ - --dtype ${DATA_TYPE} \ - --trust-remote-code \ - --max-logprobs ${MAX_LOGPROBS} \ - --max-model-len ${MAX_MODEL_LEN} \ - --max-num-seqs ${MAX_NUM_SEQS} \ - --gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} \ - --compilation-config ${COMPILATION_CONFIG} \ - --task ${TASK} \ - ${MAX_NUM_BATCHED_TOKENS} \ - ${ENABLE_PREFIX_CACHING} \ - ${ENABLE_CHUNKED_PREFILL} \ - ${ENFORCE_EAGER} -else - source ${VENV_BASE}/bin/activate - python3 -m vllm.entrypoints.openai.api_server \ - --model ${MODEL_WEIGHTS} \ - --served-model-name ${MODEL_NAME} \ - --host "0.0.0.0" \ - --port ${vllm_port_number} \ - --pipeline-parallel-size ${PIPELINE_PARALLEL_SIZE} \ - --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \ - --dtype ${DATA_TYPE} \ - --trust-remote-code \ - --max-logprobs ${MAX_LOGPROBS} \ - --max-model-len ${MAX_MODEL_LEN} \ - --max-num-seqs ${MAX_NUM_SEQS} \ - --gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} \ - --compilation-config ${COMPILATION_CONFIG} \ - --task ${TASK} \ - ${MAX_NUM_BATCHED_TOKENS} \ - ${ENABLE_PREFIX_CACHING} \ - ${ENABLE_CHUNKED_PREFILL} \ - ${ENFORCE_EAGER} -fi diff --git 
a/vec_inf/vllm.slurm b/vec_inf/vllm.slurm deleted file mode 100644 index e9729d10..00000000 --- a/vec_inf/vllm.slurm +++ /dev/null @@ -1,90 +0,0 @@ -#!/bin/bash -#SBATCH --cpus-per-task=16 -#SBATCH --mem=64G - -source ${SRC_DIR}/find_port.sh - -# Write server url to file -hostname=${SLURMD_NODENAME} -vllm_port_number=$(find_available_port $hostname 8080 65535) - -SERVER_ADDR="http://${hostname}:${vllm_port_number}/v1" -echo "Server address: $SERVER_ADDR" - -jq --arg server_addr "$SERVER_ADDR" \ - '. + {"server_address": $server_addr}' \ - "$LOG_DIR/$MODEL_NAME.$SLURM_JOB_ID/$MODEL_NAME.$SLURM_JOB_ID.json" > temp.json \ - && mv temp.json "$LOG_DIR/$MODEL_NAME.$SLURM_JOB_ID/$MODEL_NAME.$SLURM_JOB_ID.json" \ - && rm temp.json - -if [ "$ENFORCE_EAGER" = "True" ]; then - export ENFORCE_EAGER="--enforce-eager" -else - export ENFORCE_EAGER="" -fi - -if [ "$ENABLE_PREFIX_CACHING" = "True" ]; then - export ENABLE_PREFIX_CACHING="--enable-prefix-caching" -else - export ENABLE_PREFIX_CACHING="" -fi - -if [ "$ENABLE_CHUNKED_PREFILL" = "True" ]; then - export ENABLE_CHUNKED_PREFILL="--enable-chunked-prefill" -else - export ENABLE_CHUNKED_PREFILL="" -fi - -if [ -z "$MAX_NUM_BATCHED_TOKENS" ]; then - export MAX_NUM_BATCHED_TOKENS="" -else - export MAX_NUM_BATCHED_TOKENS="--max-num-batched-tokens=$MAX_NUM_BATCHED_TOKENS" -fi - -# Activate vllm venv -if [ "$VENV_BASE" = "singularity" ]; then - export SINGULARITY_IMAGE=/model-weights/vec-inf-shared/vector-inference_latest.sif - export VLLM_NCCL_SO_PATH=/vec-inf/nccl/libnccl.so.2.18.1 - module load singularity-ce/3.8.2 - singularity exec $SINGULARITY_IMAGE ray stop - singularity exec --nv --bind ${MODEL_WEIGHTS}:${MODEL_WEIGHTS} $SINGULARITY_IMAGE \ - python3.10 -m vllm.entrypoints.openai.api_server \ - --model ${MODEL_WEIGHTS} \ - --served-model-name ${MODEL_NAME} \ - --host "0.0.0.0" \ - --port ${vllm_port_number} \ - --tensor-parallel-size ${SLURM_GPUS_PER_NODE} \ - --dtype ${DATA_TYPE} \ - --max-logprobs ${MAX_LOGPROBS} \ - --trust-remote-code \ - --max-model-len ${MAX_MODEL_LEN} \ - --max-num-seqs ${MAX_NUM_SEQS} \ - --gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} \ - --compilation-config ${COMPILATION_CONFIG} \ - --task ${TASK} \ - ${MAX_NUM_BATCHED_TOKENS} \ - ${ENABLE_PREFIX_CACHING} \ - ${ENABLE_CHUNKED_PREFILL} \ - ${ENFORCE_EAGER} - -else - source ${VENV_BASE}/bin/activate - python3 -m vllm.entrypoints.openai.api_server \ - --model ${MODEL_WEIGHTS} \ - --served-model-name ${MODEL_NAME} \ - --host "0.0.0.0" \ - --port ${vllm_port_number} \ - --tensor-parallel-size ${SLURM_GPUS_PER_NODE} \ - --dtype ${DATA_TYPE} \ - --max-logprobs ${MAX_LOGPROBS} \ - --trust-remote-code \ - --max-model-len ${MAX_MODEL_LEN} \ - --max-num-seqs ${MAX_NUM_SEQS} \ - --gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} \ - --compilation-config ${COMPILATION_CONFIG} \ - --task ${TASK} \ - ${MAX_NUM_BATCHED_TOKENS} \ - ${ENABLE_PREFIX_CACHING} \ - ${ENABLE_CHUNKED_PREFILL} \ - ${ENFORCE_EAGER} -fi From 86c1c3ffc38d4603547adf7c1079047b25698f51 Mon Sep 17 00:00:00 2001 From: kohankhaki Date: Tue, 8 Apr 2025 14:05:18 -0400 Subject: [PATCH 06/13] removed export vars. 
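
With pipeline_parallelism known at generation time, the PIPELINE_PARALLELISM
export block is dropped from the rendered bash and both parallel sizes are
resolved in Python inside _generate_shared_args, leaving Slurm's own
variables to supply the node and GPU counts at run time. The flags this emits
in the two multinode cases (single-node scripts keep only the tensor-parallel
flag):

    # multinode, pipeline parallelism enabled:
    --pipeline-parallel-size $SLURM_JOB_NUM_NODES \
    --tensor-parallel-size $SLURM_GPUS_PER_NODE \

    # multinode, pipeline parallelism disabled:
    --pipeline-parallel-size 1 \
    --tensor-parallel-size $((SLURM_JOB_NUM_NODES*SLURM_GPUS_PER_NODE)) \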
--- vec_inf/cli/_slurm_script_generator.py | 27 ++++++++++---------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/vec_inf/cli/_slurm_script_generator.py b/vec_inf/cli/_slurm_script_generator.py index baf44579..cf488b64 100644 --- a/vec_inf/cli/_slurm_script_generator.py +++ b/vec_inf/cli/_slurm_script_generator.py @@ -24,10 +24,9 @@ def __init__(self, params: dict[str, Any], src_dir: str): def _generate_script_content(self) -> str: preamble = self._generate_preamble() server = self._generate_server_script() - env_exports = self._export_parallel_vars() launcher = self._generate_launcher() args = self._generate_shared_args() - return preamble + server + env_exports + launcher + args + return preamble + server + launcher + args def _generate_preamble(self) -> str: base = [ @@ -43,26 +42,20 @@ def _generate_preamble(self) -> str: base += [""] return "\n".join(base) - def _export_parallel_vars(self) -> str: - if self.is_multinode: - return """if [ "$PIPELINE_PARALLELISM" = "True" ]; then - export PIPELINE_PARALLEL_SIZE=$SLURM_JOB_NUM_NODES - export TENSOR_PARALLEL_SIZE=$SLURM_GPUS_PER_NODE -else - export PIPELINE_PARALLEL_SIZE=1 - export TENSOR_PARALLEL_SIZE=$((SLURM_JOB_NUM_NODES*SLURM_GPUS_PER_NODE)) -fi - -""" - return "export TENSOR_PARALLEL_SIZE=$SLURM_GPUS_PER_NODE\n\n" - def _generate_shared_args(self) -> str: + if self.is_multinode and not self.params["pipeline_parallelism"]: + tensor_parallel_size = "$((SLURM_JOB_NUM_NODES*SLURM_GPUS_PER_NODE))" + pipeline_parallel_size = "1" + else: + tensor_parallel_size = "$SLURM_GPUS_PER_NODE" + pipeline_parallel_size = "$SLURM_JOB_NUM_NODES" + args = [ f"--model {self.model_weights_path} \\", f"--served-model-name {self.params['model_name']} \\", '--host "0.0.0.0" \\', "--port $vllm_port_number \\", - "--tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \\", + f"--tensor-parallel-size {tensor_parallel_size} \\", f"--dtype {self.params['data_type']} \\", "--trust-remote-code \\", f"--max-logprobs {self.params['vocab_size']} \\", @@ -73,7 +66,7 @@ def _generate_shared_args(self) -> str: f"--task {self.task} \\", ] if self.is_multinode: - args.insert(4, "--pipeline-parallel-size ${PIPELINE_PARALLEL_SIZE} \\") + args.insert(4, f"--pipeline-parallel-size {pipeline_parallel_size} \\") if self.params.get("max_num_batched_tokens"): args.append( f"--max-num-batched-tokens={self.params['max_num_batched_tokens']} \\" From d100a001df0afb692429bf9bf52fdb3303d858ca Mon Sep 17 00:00:00 2001 From: kohankhaki Date: Tue, 8 Apr 2025 14:05:42 -0400 Subject: [PATCH 07/13] Revert venv.sh and pyproject.toml to match main --- pyproject.toml | 2 +- venv.sh | 9 +++------ 2 files changed, 4 insertions(+), 7 deletions(-) mode change 100755 => 100644 venv.sh diff --git a/pyproject.toml b/pyproject.toml index 1dab6bee..def192fc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ description = "Efficient LLM inference on Slurm clusters using vLLM." 
readme = "README.md" authors = [{name = "Marshall Wang", email = "marshall.wang@vectorinstitute.ai"}] license = "MIT" -requires-python = ">=3.10,<4.0" +requires-python = ">=3.10" dependencies = [ "requests>=2.31.0", "click>=8.1.0", diff --git a/venv.sh b/venv.sh old mode 100755 new mode 100644 index 82811638..a37eaade --- a/venv.sh +++ b/venv.sh @@ -1,8 +1,8 @@ -#!/bin/bash +#!bin/bash # Load python module if you are on Vector cluster and install poetry module load python/3.10.12 -pip3 install poetry +pip install poetry # Optional: it's recommended to change the cache directory to somewhere in the scratch space to avoid # running out of space in your home directory, below is an example for the Vector cluster @@ -13,14 +13,11 @@ export POETRY_CACHE_DIR=/scratch/ssd004/scratch/$(whoami)/poetry_cache # poetry config cache-dir echo "Cache directory set to: $(poetry config cache-dir)" -echo "📜 Telling Poetry to use Python 3.10..." -poetry env use python3.10 - # Install dependencies via poetry poetry install # Activate the virtual environment -# poetry shell +poetry shell # Deactivate the virtual environment # deactivate From 112339b1e8379c87f021307e1437bafcfbc7532b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 10 Apr 2025 15:09:16 +0000 Subject: [PATCH 08/13] [pre-commit.ci] Add auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- vec_inf/cli/_helper.py | 1 - vec_inf/client/_helper.py | 1 - 2 files changed, 2 deletions(-) diff --git a/vec_inf/cli/_helper.py b/vec_inf/cli/_helper.py index 6c7db4cb..bb3d8a37 100644 --- a/vec_inf/cli/_helper.py +++ b/vec_inf/cli/_helper.py @@ -1,6 +1,5 @@ """Helper classes for the CLI.""" -import os from typing import Any, Union import click diff --git a/vec_inf/client/_helper.py b/vec_inf/client/_helper.py index 7b7f0e8d..4e84e8b4 100644 --- a/vec_inf/client/_helper.py +++ b/vec_inf/client/_helper.py @@ -31,7 +31,6 @@ LD_LIBRARY_PATH, REQUIRED_FIELDS, SRC_DIR, - VLLM_TASK_MAP, ) From 96955825dd54d052bdb953b93179041d39fb0a97 Mon Sep 17 00:00:00 2001 From: Marshall Wang Date: Thu, 10 Apr 2025 11:17:24 -0400 Subject: [PATCH 09/13] Add missing import --- vec_inf/cli/_helper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vec_inf/cli/_helper.py b/vec_inf/cli/_helper.py index bb3d8a37..af84fa21 100644 --- a/vec_inf/cli/_helper.py +++ b/vec_inf/cli/_helper.py @@ -1,5 +1,6 @@ """Helper classes for the CLI.""" +from pathlib import Path from typing import Any, Union import click From 3742e122ccbc621e201192aff66914d91fc5ab01 Mon Sep 17 00:00:00 2001 From: Marshall Wang Date: Thu, 10 Apr 2025 14:23:25 -0400 Subject: [PATCH 10/13] Move generated slurm script to job log directory post launch --- tests/vec_inf/cli/test_cli.py | 64 +++++++++-------------- vec_inf/client/_helper.py | 22 +++++--- vec_inf/client/_slurm_script_generator.py | 8 +-- 3 files changed, 45 insertions(+), 49 deletions(-) diff --git a/tests/vec_inf/cli/test_cli.py b/tests/vec_inf/cli/test_cli.py index e249c636..c0f24300 100644 --- a/tests/vec_inf/cli/test_cli.py +++ b/tests/vec_inf/cli/test_cli.py @@ -226,13 +226,12 @@ def base_patches(test_paths, mock_truediv, debug_helper): "pathlib.Path.parent", return_value=debug_helper.config_file.parent.parent ), patch("pathlib.Path.__truediv__", side_effect=mock_truediv), - patch("pathlib.Path.iterdir", return_value=[]), # Mock empty directory listing + patch("pathlib.Path.iterdir", return_value=[]), patch("json.dump"), patch("pathlib.Path.touch"), 
patch("vec_inf.client._utils.Path", return_value=test_paths["weights_dir"]), - patch( - "pathlib.Path.home", return_value=Path("/home/user") - ), # Mock home directory + patch("pathlib.Path.home", return_value=Path("/home/user")), + patch("pathlib.Path.rename"), ] @@ -246,25 +245,19 @@ def apply_base_patches(base_patches): yield -def test_launch_command_success(runner, mock_launch_output, path_exists, debug_helper): +def test_launch_command_success( + runner, mock_launch_output, path_exists, debug_helper, mock_truediv, test_paths, base_patches +): """Test successful model launch with minimal required arguments.""" - test_log_dir = Path("/tmp/test_vec_inf_logs") + with ExitStack() as stack: + # Apply all base patches + for patch_obj in base_patches: + stack.enter_context(patch_obj) + + # Apply specific patches for this test + mock_run = stack.enter_context(patch("vec_inf.client._utils.run_bash_command")) + stack.enter_context(patch("pathlib.Path.exists", new=path_exists)) - with ( - patch("vec_inf.client._utils.run_bash_command") as mock_run, - patch("pathlib.Path.mkdir"), - patch("builtins.open", debug_helper.tracked_mock_open), - patch("pathlib.Path.open", debug_helper.tracked_mock_open), - patch("pathlib.Path.exists", new=path_exists), - patch("pathlib.Path.expanduser", return_value=test_log_dir), - patch("pathlib.Path.resolve", return_value=debug_helper.config_file.parent), - patch( - "pathlib.Path.parent", return_value=debug_helper.config_file.parent.parent - ), - patch("json.dump"), - patch("pathlib.Path.touch"), - patch("pathlib.Path.__truediv__", return_value=test_log_dir), - ): expected_job_id = "14933053" mock_run.return_value = mock_launch_output(expected_job_id) @@ -277,25 +270,18 @@ def test_launch_command_success(runner, mock_launch_output, path_exists, debug_h def test_launch_command_with_json_output( - runner, mock_launch_output, path_exists, debug_helper + runner, mock_launch_output, path_exists, debug_helper, mock_truediv, test_paths, base_patches ): """Test JSON output format for launch command.""" - test_log_dir = Path("/tmp/test_vec_inf_logs") - with ( - patch("vec_inf.client._utils.run_bash_command") as mock_run, - patch("pathlib.Path.mkdir"), - patch("builtins.open", debug_helper.tracked_mock_open), - patch("pathlib.Path.open", debug_helper.tracked_mock_open), - patch("pathlib.Path.exists", new=path_exists), - patch("pathlib.Path.expanduser", return_value=test_log_dir), - patch("pathlib.Path.resolve", return_value=debug_helper.config_file.parent), - patch( - "pathlib.Path.parent", return_value=debug_helper.config_file.parent.parent - ), - patch("json.dump"), - patch("pathlib.Path.touch"), - patch("pathlib.Path.__truediv__", return_value=test_log_dir), - ): + with ExitStack() as stack: + # Apply all base patches + for patch_obj in base_patches: + stack.enter_context(patch_obj) + + # Apply specific patches for this test + mock_run = stack.enter_context(patch("vec_inf.client._utils.run_bash_command")) + stack.enter_context(patch("pathlib.Path.exists", new=path_exists)) + expected_job_id = "14933051" mock_run.return_value = mock_launch_output(expected_job_id) @@ -319,7 +305,7 @@ def test_launch_command_with_json_output( assert output.get("slurm_job_id") == expected_job_id assert output.get("model_name") == "Meta-Llama-3.1-8B" assert output.get("model_type") == "LLM" - assert str(test_log_dir) in output.get("log_dir", "") + assert str(test_paths["log_dir"]) in output.get("log_dir", "") def test_launch_command_no_model_weights_parent_dir(runner, debug_helper, base_patches): 
diff --git a/vec_inf/client/_helper.py b/vec_inf/client/_helper.py index 4e84e8b4..1883c0d7 100644 --- a/vec_inf/client/_helper.py +++ b/vec_inf/client/_helper.py @@ -50,6 +50,7 @@ def __init__(self, model_name: str, kwargs: Optional[dict[str, Any]]): self.model_name = model_name self.kwargs = kwargs or {} self.slurm_job_id = "" + self.slurm_script_path = Path("") self.model_config = self._get_model_configuration() self.params = self._get_launch_params() @@ -163,8 +164,10 @@ def _build_launch_command(self) -> str: ] ) # Add slurm script - slurm_script = SlurmScriptGenerator(self.params, SRC_DIR).write_to_log_dir() - command_list.append(str(slurm_script)) + self.slurm_script_path = SlurmScriptGenerator( + self.params, SRC_DIR + ).write_to_log_dir() + command_list.append(str(self.slurm_script_path)) return " ".join(command_list) def launch(self) -> LaunchResponse: @@ -181,15 +184,22 @@ def launch(self) -> LaunchResponse: self.slurm_job_id = command_output.split(" ")[-1].strip().strip("\n") self.params["slurm_job_id"] = self.slurm_job_id - # Create log directory and job json file + # Create log directory and job json file, move slurm script to job log directory + job_log_dir = Path( + self.params["log_dir"], f"{self.model_name}.{self.slurm_job_id}" + ) + job_log_dir.mkdir(parents=True, exist_ok=True) + job_json = Path( - self.params["log_dir"], - f"{self.model_name}.{self.slurm_job_id}", + job_log_dir, f"{self.model_name}.{self.slurm_job_id}.json", ) - job_json.parent.mkdir(parents=True, exist_ok=True) job_json.touch(exist_ok=True) + self.slurm_script_path.rename( + job_log_dir / f"{self.model_name}.{self.slurm_job_id}.slurm" + ) + with job_json.open("w") as file: json.dump(self.params, file, indent=4) diff --git a/vec_inf/client/_slurm_script_generator.py b/vec_inf/client/_slurm_script_generator.py index 66ffef83..0eca7b2d 100644 --- a/vec_inf/client/_slurm_script_generator.py +++ b/vec_inf/client/_slurm_script_generator.py @@ -170,11 +170,11 @@ def _generate_launcher(self) -> str: return "\n".join(launcher_script) def write_to_log_dir(self) -> Path: - log_subdir: Path = Path(self.params["log_dir"]) / self.params["model_name"] - log_subdir.mkdir(parents=True, exist_ok=True) - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - script_path: Path = log_subdir / f"launch_{timestamp}.slurm" + script_path: Path = ( + Path(self.params["log_dir"]) + / f"launch_{self.params['model_name']}_{timestamp}.slurm" + ) content = self._generate_script_content() script_path.write_text(content) From f6273a05c2aa266f166a6190b0879e2e8167426c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 10 Apr 2025 18:25:16 +0000 Subject: [PATCH 11/13] [pre-commit.ci] Add auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/vec_inf/cli/test_cli.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/tests/vec_inf/cli/test_cli.py b/tests/vec_inf/cli/test_cli.py index c0f24300..155b0913 100644 --- a/tests/vec_inf/cli/test_cli.py +++ b/tests/vec_inf/cli/test_cli.py @@ -246,14 +246,20 @@ def apply_base_patches(base_patches): def test_launch_command_success( - runner, mock_launch_output, path_exists, debug_helper, mock_truediv, test_paths, base_patches + runner, + mock_launch_output, + path_exists, + debug_helper, + mock_truediv, + test_paths, + base_patches, ): """Test successful model launch with minimal required arguments.""" with ExitStack() as stack: # Apply all base patches for 
patch_obj in base_patches: stack.enter_context(patch_obj) - + # Apply specific patches for this test mock_run = stack.enter_context(patch("vec_inf.client._utils.run_bash_command")) stack.enter_context(patch("pathlib.Path.exists", new=path_exists)) @@ -270,14 +276,20 @@ def test_launch_command_success( def test_launch_command_with_json_output( - runner, mock_launch_output, path_exists, debug_helper, mock_truediv, test_paths, base_patches + runner, + mock_launch_output, + path_exists, + debug_helper, + mock_truediv, + test_paths, + base_patches, ): """Test JSON output format for launch command.""" with ExitStack() as stack: # Apply all base patches for patch_obj in base_patches: stack.enter_context(patch_obj) - + # Apply specific patches for this test mock_run = stack.enter_context(patch("vec_inf.client._utils.run_bash_command")) stack.enter_context(patch("pathlib.Path.exists", new=path_exists)) From 348a22e44db8af00f93f805c64f9140f6029bde4 Mon Sep 17 00:00:00 2001 From: Marshall Wang Date: Thu, 10 Apr 2025 18:01:36 -0400 Subject: [PATCH 12/13] Move all environment var declaration to python, remove unnecessary env var usage in generated slurm script, change non env var name to lower case --- vec_inf/client/_helper.py | 4 +++ vec_inf/client/_slurm_script_generator.py | 30 ++++++++++------------- vec_inf/client/_vars.py | 2 ++ 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/vec_inf/client/_helper.py b/vec_inf/client/_helper.py index 1883c0d7..0fa5c4c2 100644 --- a/vec_inf/client/_helper.py +++ b/vec_inf/client/_helper.py @@ -30,7 +30,9 @@ BOOLEAN_FIELDS, LD_LIBRARY_PATH, REQUIRED_FIELDS, + SINGULARITY_IMAGE, SRC_DIR, + VLLM_NCCL_SO_PATH, ) @@ -139,6 +141,8 @@ def _get_launch_params(self) -> dict[str, Any]: def _set_env_vars(self) -> None: """Set environment variables for the launch command.""" os.environ["LD_LIBRARY_PATH"] = LD_LIBRARY_PATH + os.environ["VLLM_NCCL_SO_PATH"] = VLLM_NCCL_SO_PATH + os.environ["SINGULARITY_IMAGE"] = SINGULARITY_IMAGE def _build_launch_command(self) -> str: """Construct the full launch command with parameters.""" diff --git a/vec_inf/client/_slurm_script_generator.py b/vec_inf/client/_slurm_script_generator.py index 0eca7b2d..c72b8812 100644 --- a/vec_inf/client/_slurm_script_generator.py +++ b/vec_inf/client/_slurm_script_generator.py @@ -38,11 +38,11 @@ def _generate_preamble(self) -> str: def _generate_shared_args(self) -> str: if self.is_multinode and not self.params["pipeline_parallelism"]: - tensor_parallel_size = "$((SLURM_JOB_NUM_NODES*SLURM_GPUS_PER_NODE))" - pipeline_parallel_size = "1" + tensor_parallel_size = self.params["num_nodes"] * self.params["gpus_per_node"] + pipeline_parallel_size = 1 else: - tensor_parallel_size = "$SLURM_GPUS_PER_NODE" - pipeline_parallel_size = "$SLURM_JOB_NUM_NODES" + tensor_parallel_size = self.params["gpus_per_node"] + pipeline_parallel_size = self.params["num_nodes"] args = [ f"--model {self.model_weights_path} \\", @@ -77,9 +77,7 @@ def _generate_shared_args(self) -> str: def _generate_server_script(self) -> str: server_script = [""] if self.params["venv"] == "singularity": - server_script.append("""export SINGULARITY_IMAGE=/model-weights/vec-inf-shared/vector-inference_latest.sif -export VLLM_NCCL_SO_PATH=/vec-inf/nccl/libnccl.so.2.18.1 -module load singularity-ce/3.8.2 + server_script.append("""module load singularity-ce/3.8.2 singularity exec $SINGULARITY_IMAGE ray stop """) server_script.append(f"source {self.src_dir}/find_port.sh\n") @@ -88,13 +86,11 @@ def _generate_server_script(self) -> str: if 
self.is_multinode else self._generate_single_node_server_script() ) - server_script.append(f"""echo "Updating server address in $JSON_PATH" -JSON_PATH="{self.params["log_dir"]}/{self.params["model_name"]}.$SLURM_JOB_ID/{self.params["model_name"]}.$SLURM_JOB_ID.json" -jq --arg server_addr "$SERVER_ADDR" \\ + server_script.append(f"""json_path="{self.params["log_dir"]}/{self.params["model_name"]}.$SLURM_JOB_ID/{self.params["model_name"]}.$SLURM_JOB_ID.json" +jq --arg server_addr "$server_address" \\ '. + {{"server_address": $server_addr}}' \\ - "$JSON_PATH" > temp.json \\ - && mv temp.json "$JSON_PATH" \\ - && rm -f temp.json + "$json_path" > temp.json \\ + && mv temp.json "$json_path" """) return "\n".join(server_script) @@ -103,8 +99,8 @@ def _generate_single_node_server_script(self) -> str: return """hostname=${SLURMD_NODENAME} vllm_port_number=$(find_available_port ${hostname} 8080 65535) -SERVER_ADDR="http://${hostname}:${vllm_port_number}/v1" -echo "Server address: $SERVER_ADDR" +server_address="http://${hostname}:${vllm_port_number}/v1" +echo "Server address: $server_address" """ def _generate_multinode_server_script(self) -> str: @@ -151,8 +147,8 @@ def _generate_multinode_server_script(self) -> str: vllm_port_number=$(find_available_port $head_node_ip 8080 65535) -SERVER_ADDR="http://${head_node_ip}:${vllm_port_number}/v1" -echo "Server address: $SERVER_ADDR" +server_address="http://${head_node_ip}:${vllm_port_number}/v1" +echo "Server address: $server_address" """) return "\n".join(server_script) diff --git a/vec_inf/client/_vars.py b/vec_inf/client/_vars.py index 71e9e221..b8e5dca6 100644 --- a/vec_inf/client/_vars.py +++ b/vec_inf/client/_vars.py @@ -7,6 +7,8 @@ CACHED_CONFIG = Path("/", "model-weights", "vec-inf-shared", "models.yaml") SRC_DIR = str(Path(__file__).parent.parent) LD_LIBRARY_PATH = "/scratch/ssd001/pkgs/cudnn-11.7-v8.5.0.96/lib/:/scratch/ssd001/pkgs/cuda-11.7/targets/x86_64-linux/lib/" +VLLM_NCCL_SO_PATH = "/vec-inf/nccl/libnccl.so.2.18.1" +SINGULARITY_IMAGE = "/model-weights/vec-inf-shared/vector-inference_latest.sif" # Maps model types to vLLM tasks VLLM_TASK_MAP = { From a6a090c2a9d13e1c14aa554c550ffe7beeeacb3e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 10 Apr 2025 22:01:56 +0000 Subject: [PATCH 13/13] [pre-commit.ci] Add auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- vec_inf/client/_slurm_script_generator.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vec_inf/client/_slurm_script_generator.py b/vec_inf/client/_slurm_script_generator.py index c72b8812..3daabe00 100644 --- a/vec_inf/client/_slurm_script_generator.py +++ b/vec_inf/client/_slurm_script_generator.py @@ -38,7 +38,9 @@ def _generate_preamble(self) -> str: def _generate_shared_args(self) -> str: if self.is_multinode and not self.params["pipeline_parallelism"]: - tensor_parallel_size = self.params["num_nodes"] * self.params["gpus_per_node"] + tensor_parallel_size = ( + self.params["num_nodes"] * self.params["gpus_per_node"] + ) pipeline_parallel_size = 1 else: tensor_parallel_size = self.params["gpus_per_node"]
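
Taken together, the series replaces the static vllm.slurm and
multinode_vllm.slurm templates, and the env-var plumbing that fed them, with
a self-contained script rendered per launch. A condensed sketch of the
resulting flow; subprocess stands in for the client's run_bash_command
helper, and the real sbatch command also carries the output/error log flags
and other options built from params:

    import subprocess
    from pathlib import Path

    from vec_inf.client._slurm_script_generator import SlurmScriptGenerator
    from vec_inf.client._vars import SRC_DIR

    def launch(params: dict) -> str:
        # Render log_dir/launch_<model>_<timestamp>.slurm from the model params.
        script_path = SlurmScriptGenerator(params, SRC_DIR).write_to_log_dir()

        # Submit it; sbatch answers "Submitted batch job <id>".
        out = subprocess.run(
            ["sbatch", str(script_path)], capture_output=True, text=True, check=True
        ).stdout
        job_id = out.split(" ")[-1].strip()

        # Move the rendered script into the per-job log directory, next to the
        # <model>.<job_id>.json metadata file written at launch time.
        job_log_dir = Path(params["log_dir"]) / f"{params['model_name']}.{job_id}"
        job_log_dir.mkdir(parents=True, exist_ok=True)
        script_path.rename(job_log_dir / f"{params['model_name']}.{job_id}.slurm")
        return job_id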