Skip to content

Commit a6e5432

Browse files
committed
Move slurm templates into a new file for readability and clarity, add templates for batch mode scripts
1 parent 1169151 commit a6e5432

File tree

2 files changed

+248
-152
lines changed

2 files changed

+248
-152
lines changed

vec_inf/client/_client_vars.py

Lines changed: 3 additions & 152 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
"""Global variables for Vector Inference.
22
33
This module contains configuration constants and templates used throughout the
4-
Vector Inference package, including SLURM script templates, model configurations,
5-
and metric definitions.
4+
Vector Inference package, including model configurations and metric definitions.
65
76
Constants
87
---------
@@ -19,14 +18,6 @@
1918
"""
2019

2120
from pathlib import Path
22-
from typing import TypedDict
23-
24-
from vec_inf.client.slurm_vars import (
25-
LD_LIBRARY_PATH,
26-
SINGULARITY_IMAGE,
27-
SINGULARITY_LOAD_CMD,
28-
VLLM_NCCL_SO_PATH,
29-
)
3021

3122

3223
MODEL_READY_SIGNATURE = "INFO: Application startup complete."
@@ -77,145 +68,5 @@
7768
"-q": "--quantization",
7869
}
7970

80-
# Slurm script templates
81-
class ShebangConfig(TypedDict):
82-
"""TypedDict for SLURM script shebang configuration.
83-
84-
Parameters
85-
----------
86-
base : str
87-
Base shebang line for all SLURM scripts
88-
multinode : list[str]
89-
Additional SLURM directives for multi-node configurations
90-
"""
91-
92-
base: str
93-
multinode: list[str]
94-
95-
96-
class ServerSetupConfig(TypedDict):
97-
"""TypedDict for server setup configuration.
98-
99-
Parameters
100-
----------
101-
single_node : list[str]
102-
Setup commands for single-node deployments
103-
multinode : list[str]
104-
Setup commands for multi-node deployments, including Ray initialization
105-
"""
106-
107-
single_node: list[str]
108-
multinode: list[str]
109-
110-
111-
class SlurmScriptTemplate(TypedDict):
112-
"""TypedDict for complete SLURM script template configuration.
113-
114-
Parameters
115-
----------
116-
shebang : ShebangConfig
117-
Shebang and SLURM directive configuration
118-
singularity_setup : list[str]
119-
Commands for Singularity container setup
120-
imports : str
121-
Import statements and source commands
122-
env_vars : list[str]
123-
Environment variables to set
124-
singularity_command : str
125-
Template for Singularity execution command
126-
activate_venv : str
127-
Template for virtual environment activation
128-
server_setup : ServerSetupConfig
129-
Server initialization commands for different deployment modes
130-
find_vllm_port : list[str]
131-
Commands to find available ports for vLLM server
132-
write_to_json : list[str]
133-
Commands to write server configuration to JSON
134-
launch_cmd : list[str]
135-
vLLM server launch commands
136-
"""
137-
138-
shebang: ShebangConfig
139-
singularity_setup: list[str]
140-
imports: str
141-
env_vars: list[str]
142-
singularity_command: str
143-
activate_venv: str
144-
server_setup: ServerSetupConfig
145-
find_vllm_port: list[str]
146-
write_to_json: list[str]
147-
launch_cmd: list[str]
148-
149-
150-
SLURM_SCRIPT_TEMPLATE: SlurmScriptTemplate = {
151-
"shebang": {
152-
"base": "#!/bin/bash",
153-
"multinode": [
154-
"#SBATCH --exclusive",
155-
"#SBATCH --tasks-per-node=1",
156-
],
157-
},
158-
"singularity_setup": [
159-
SINGULARITY_LOAD_CMD,
160-
f"singularity exec {SINGULARITY_IMAGE} ray stop",
161-
],
162-
"imports": "source {src_dir}/find_port.sh",
163-
"env_vars": [
164-
f"export LD_LIBRARY_PATH={LD_LIBRARY_PATH}",
165-
f"export VLLM_NCCL_SO_PATH={VLLM_NCCL_SO_PATH}",
166-
],
167-
"singularity_command": f"singularity exec --nv --bind {{model_weights_path}}{{additional_binds}} --containall {SINGULARITY_IMAGE}",
168-
"activate_venv": "source {venv}/bin/activate",
169-
"server_setup": {
170-
"single_node": [
171-
"\n# Find available port",
172-
"head_node_ip=${SLURMD_NODENAME}",
173-
],
174-
"multinode": [
175-
"\n# Get list of nodes",
176-
'nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")',
177-
"nodes_array=($nodes)",
178-
"head_node=${nodes_array[0]}",
179-
'head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)',
180-
"\n# Start Ray head node",
181-
"head_node_port=$(find_available_port $head_node_ip 8080 65535)",
182-
"ray_head=$head_node_ip:$head_node_port",
183-
'echo "Ray Head IP: $ray_head"',
184-
'echo "Starting HEAD at $head_node"',
185-
'srun --nodes=1 --ntasks=1 -w "$head_node" \\',
186-
" SINGULARITY_PLACEHOLDER \\",
187-
' ray start --head --node-ip-address="$head_node_ip" --port=$head_node_port \\',
188-
' --num-cpus "$SLURM_CPUS_PER_TASK" --num-gpus "$SLURM_GPUS_PER_NODE" --block &',
189-
"sleep 10",
190-
"\n# Start Ray worker nodes",
191-
"worker_num=$((SLURM_JOB_NUM_NODES - 1))",
192-
"for ((i = 1; i <= worker_num; i++)); do",
193-
" node_i=${nodes_array[$i]}",
194-
' echo "Starting WORKER $i at $node_i"',
195-
' srun --nodes=1 --ntasks=1 -w "$node_i" \\',
196-
" SINGULARITY_PLACEHOLDER \\",
197-
' ray start --address "$ray_head" \\',
198-
' --num-cpus "$SLURM_CPUS_PER_TASK" --num-gpus "$SLURM_GPUS_PER_NODE" --block &',
199-
" sleep 5",
200-
"done",
201-
],
202-
},
203-
"find_vllm_port": [
204-
"\nvllm_port_number=$(find_available_port $head_node_ip 8080 65535)",
205-
'server_address="http://${head_node_ip}:${vllm_port_number}/v1"',
206-
],
207-
"write_to_json": [
208-
'\njson_path="{log_dir}/{model_name}.$SLURM_JOB_ID/{model_name}.$SLURM_JOB_ID.json"',
209-
'jq --arg server_addr "$server_address" \\',
210-
" '. + {{\"server_address\": $server_addr}}' \\",
211-
' "$json_path" > temp.json \\',
212-
' && mv temp.json "$json_path"',
213-
],
214-
"launch_cmd": [
215-
"vllm serve {model_weights_path} \\",
216-
" --served-model-name {model_name} \\",
217-
' --host "0.0.0.0" \\',
218-
" --port $vllm_port_number \\",
219-
" --trust-remote-code \\",
220-
],
221-
}
71+
# Required matching arguments for batch mode
72+
BATCH_MODE_REQUIRED_MATCHING_ARGS = ["venv", "log_dir"]

0 commit comments

Comments
 (0)