1 | 1 | """Global variables for Vector Inference. |
2 | 2 |
3 | 3 | This module contains configuration constants and templates used throughout the |
4 | | -Vector Inference package, including SLURM script templates, model configurations, |
5 | | -and metric definitions. |
| 4 | +Vector Inference package, including model configurations and metric definitions.
6 | 5 |
7 | 6 | Constants |
8 | 7 | --------- |
19 | 18 | """ |
20 | 19 |
21 | 20 | from pathlib import Path |
22 | | -from typing import TypedDict |
23 | | - |
24 | | -from vec_inf.client.slurm_vars import ( |
25 | | - LD_LIBRARY_PATH, |
26 | | - SINGULARITY_IMAGE, |
27 | | - SINGULARITY_LOAD_CMD, |
28 | | - VLLM_NCCL_SO_PATH, |
29 | | -) |
30 | 21 |
31 | 22 |
32 | 23 | MODEL_READY_SIGNATURE = "INFO: Application startup complete." |
77 | 68 | "-q": "--quantization", |
78 | 69 | } |
79 | 70 |
80 | | -# Slurm script templates |
81 | | -class ShebangConfig(TypedDict): |
82 | | - """TypedDict for SLURM script shebang configuration. |
83 | | -
84 | | - Parameters |
85 | | - ---------- |
86 | | - base : str |
87 | | - Base shebang line for all SLURM scripts |
88 | | - multinode : list[str] |
89 | | - Additional SLURM directives for multi-node configurations |
90 | | - """ |
91 | | - |
92 | | - base: str |
93 | | - multinode: list[str] |
94 | | - |
95 | | - |
96 | | -class ServerSetupConfig(TypedDict): |
97 | | - """TypedDict for server setup configuration. |
98 | | -
99 | | - Parameters |
100 | | - ---------- |
101 | | - single_node : list[str] |
102 | | - Setup commands for single-node deployments |
103 | | - multinode : list[str] |
104 | | - Setup commands for multi-node deployments, including Ray initialization |
105 | | - """ |
106 | | - |
107 | | - single_node: list[str] |
108 | | - multinode: list[str] |
109 | | - |
110 | | - |
111 | | -class SlurmScriptTemplate(TypedDict): |
112 | | - """TypedDict for complete SLURM script template configuration. |
113 | | -
114 | | - Parameters |
115 | | - ---------- |
116 | | - shebang : ShebangConfig |
117 | | - Shebang and SLURM directive configuration |
118 | | - singularity_setup : list[str] |
119 | | - Commands for Singularity container setup |
120 | | - imports : str |
121 | | - Import statements and source commands |
122 | | - env_vars : list[str] |
123 | | - Environment variables to set |
124 | | - singularity_command : str |
125 | | - Template for Singularity execution command |
126 | | - activate_venv : str |
127 | | - Template for virtual environment activation |
128 | | - server_setup : ServerSetupConfig |
129 | | - Server initialization commands for different deployment modes |
130 | | - find_vllm_port : list[str] |
131 | | - Commands to find available ports for vLLM server |
132 | | - write_to_json : list[str] |
133 | | - Commands to write server configuration to JSON |
134 | | - launch_cmd : list[str] |
135 | | - vLLM server launch commands |
136 | | - """ |
137 | | - |
138 | | - shebang: ShebangConfig |
139 | | - singularity_setup: list[str] |
140 | | - imports: str |
141 | | - env_vars: list[str] |
142 | | - singularity_command: str |
143 | | - activate_venv: str |
144 | | - server_setup: ServerSetupConfig |
145 | | - find_vllm_port: list[str] |
146 | | - write_to_json: list[str] |
147 | | - launch_cmd: list[str] |
148 | | - |
149 | | - |
150 | | -SLURM_SCRIPT_TEMPLATE: SlurmScriptTemplate = { |
151 | | - "shebang": { |
152 | | - "base": "#!/bin/bash", |
153 | | - "multinode": [ |
154 | | - "#SBATCH --exclusive", |
155 | | - "#SBATCH --tasks-per-node=1", |
156 | | - ], |
157 | | - }, |
158 | | - "singularity_setup": [ |
159 | | - SINGULARITY_LOAD_CMD, |
160 | | - f"singularity exec {SINGULARITY_IMAGE} ray stop", |
161 | | - ], |
162 | | - "imports": "source {src_dir}/find_port.sh", |
163 | | - "env_vars": [ |
164 | | - f"export LD_LIBRARY_PATH={LD_LIBRARY_PATH}", |
165 | | - f"export VLLM_NCCL_SO_PATH={VLLM_NCCL_SO_PATH}", |
166 | | - ], |
167 | | - "singularity_command": f"singularity exec --nv --bind {{model_weights_path}}{{additional_binds}} --containall {SINGULARITY_IMAGE}", |
168 | | - "activate_venv": "source {venv}/bin/activate", |
169 | | - "server_setup": { |
170 | | - "single_node": [ |
171 | | - "\n# Find available port", |
172 | | - "head_node_ip=${SLURMD_NODENAME}", |
173 | | - ], |
174 | | - "multinode": [ |
175 | | - "\n# Get list of nodes", |
176 | | - 'nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")', |
177 | | - "nodes_array=($nodes)", |
178 | | - "head_node=${nodes_array[0]}", |
179 | | - 'head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)', |
180 | | - "\n# Start Ray head node", |
181 | | - "head_node_port=$(find_available_port $head_node_ip 8080 65535)", |
182 | | - "ray_head=$head_node_ip:$head_node_port", |
183 | | - 'echo "Ray Head IP: $ray_head"', |
184 | | - 'echo "Starting HEAD at $head_node"', |
185 | | - 'srun --nodes=1 --ntasks=1 -w "$head_node" \\', |
186 | | - " SINGULARITY_PLACEHOLDER \\", |
187 | | - ' ray start --head --node-ip-address="$head_node_ip" --port=$head_node_port \\', |
188 | | - ' --num-cpus "$SLURM_CPUS_PER_TASK" --num-gpus "$SLURM_GPUS_PER_NODE" --block &', |
189 | | - "sleep 10", |
190 | | - "\n# Start Ray worker nodes", |
191 | | - "worker_num=$((SLURM_JOB_NUM_NODES - 1))", |
192 | | - "for ((i = 1; i <= worker_num; i++)); do", |
193 | | - " node_i=${nodes_array[$i]}", |
194 | | - ' echo "Starting WORKER $i at $node_i"', |
195 | | - ' srun --nodes=1 --ntasks=1 -w "$node_i" \\', |
196 | | - " SINGULARITY_PLACEHOLDER \\", |
197 | | - ' ray start --address "$ray_head" \\', |
198 | | - ' --num-cpus "$SLURM_CPUS_PER_TASK" --num-gpus "$SLURM_GPUS_PER_NODE" --block &', |
199 | | - " sleep 5", |
200 | | - "done", |
201 | | - ], |
202 | | - }, |
203 | | - "find_vllm_port": [ |
204 | | - "\nvllm_port_number=$(find_available_port $head_node_ip 8080 65535)", |
205 | | - 'server_address="http://${head_node_ip}:${vllm_port_number}/v1"', |
206 | | - ], |
207 | | - "write_to_json": [ |
208 | | - '\njson_path="{log_dir}/{model_name}.$SLURM_JOB_ID/{model_name}.$SLURM_JOB_ID.json"', |
209 | | - 'jq --arg server_addr "$server_address" \\', |
210 | | - " '. + {{\"server_address\": $server_addr}}' \\", |
211 | | - ' "$json_path" > temp.json \\', |
212 | | - ' && mv temp.json "$json_path"', |
213 | | - ], |
214 | | - "launch_cmd": [ |
215 | | - "vllm serve {model_weights_path} \\", |
216 | | - " --served-model-name {model_name} \\", |
217 | | - ' --host "0.0.0.0" \\', |
218 | | - " --port $vllm_port_number \\", |
219 | | - " --trust-remote-code \\", |
220 | | - ], |
221 | | -} |
| 71 | +# Required matching arguments for batch mode |
| 72 | +BATCH_MODE_REQUIRED_MATCHING_ARGS = ["venv", "log_dir"] |
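For context (not part of the diff above): a minimal sketch of how a batch launcher might consume the new `BATCH_MODE_REQUIRED_MATCHING_ARGS` constant, assuming each model in a batch launch is described by a per-model options dict. The helper name `check_batch_compatibility` and the dict-based config shape are illustrative assumptions, not code from this commit.

```python
# Hypothetical illustration only: enforce that every model in a batch launch
# agrees on the arguments listed in BATCH_MODE_REQUIRED_MATCHING_ARGS.
from typing import Any

BATCH_MODE_REQUIRED_MATCHING_ARGS = ["venv", "log_dir"]


def check_batch_compatibility(model_configs: list[dict[str, Any]]) -> None:
    """Raise ValueError if a required matching argument differs across the batch."""
    for arg in BATCH_MODE_REQUIRED_MATCHING_ARGS:
        # Collect the distinct values supplied for this argument across all models.
        values = {config.get(arg) for config in model_configs}
        if len(values) > 1:
            raise ValueError(
                f"All models in a batch launch must use the same '{arg}'; "
                f"got {sorted(map(str, values))}"
            )


# Example: the second config uses a different log_dir, so this raises ValueError.
check_batch_compatibility(
    [
        {"venv": "/opt/venv", "log_dir": "/scratch/logs"},
        {"venv": "/opt/venv", "log_dir": "/tmp/logs"},
    ]
)
```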