diff --git a/tests/vec_inf/cli/test_cli.py b/tests/vec_inf/cli/test_cli.py index e249c636..155b0913 100644 --- a/tests/vec_inf/cli/test_cli.py +++ b/tests/vec_inf/cli/test_cli.py @@ -226,13 +226,12 @@ def base_patches(test_paths, mock_truediv, debug_helper): "pathlib.Path.parent", return_value=debug_helper.config_file.parent.parent ), patch("pathlib.Path.__truediv__", side_effect=mock_truediv), - patch("pathlib.Path.iterdir", return_value=[]), # Mock empty directory listing + patch("pathlib.Path.iterdir", return_value=[]), patch("json.dump"), patch("pathlib.Path.touch"), patch("vec_inf.client._utils.Path", return_value=test_paths["weights_dir"]), - patch( - "pathlib.Path.home", return_value=Path("/home/user") - ), # Mock home directory + patch("pathlib.Path.home", return_value=Path("/home/user")), + patch("pathlib.Path.rename"), ] @@ -246,25 +245,25 @@ def apply_base_patches(base_patches): yield -def test_launch_command_success(runner, mock_launch_output, path_exists, debug_helper): +def test_launch_command_success( + runner, + mock_launch_output, + path_exists, + debug_helper, + mock_truediv, + test_paths, + base_patches, +): """Test successful model launch with minimal required arguments.""" - test_log_dir = Path("/tmp/test_vec_inf_logs") + with ExitStack() as stack: + # Apply all base patches + for patch_obj in base_patches: + stack.enter_context(patch_obj) + + # Apply specific patches for this test + mock_run = stack.enter_context(patch("vec_inf.client._utils.run_bash_command")) + stack.enter_context(patch("pathlib.Path.exists", new=path_exists)) - with ( - patch("vec_inf.client._utils.run_bash_command") as mock_run, - patch("pathlib.Path.mkdir"), - patch("builtins.open", debug_helper.tracked_mock_open), - patch("pathlib.Path.open", debug_helper.tracked_mock_open), - patch("pathlib.Path.exists", new=path_exists), - patch("pathlib.Path.expanduser", return_value=test_log_dir), - patch("pathlib.Path.resolve", return_value=debug_helper.config_file.parent), - patch( 
- "pathlib.Path.parent", return_value=debug_helper.config_file.parent.parent - ), - patch("json.dump"), - patch("pathlib.Path.touch"), - patch("pathlib.Path.__truediv__", return_value=test_log_dir), - ): expected_job_id = "14933053" mock_run.return_value = mock_launch_output(expected_job_id) @@ -277,25 +276,24 @@ def test_launch_command_success(runner, mock_launch_output, path_exists, debug_h def test_launch_command_with_json_output( - runner, mock_launch_output, path_exists, debug_helper + runner, + mock_launch_output, + path_exists, + debug_helper, + mock_truediv, + test_paths, + base_patches, ): """Test JSON output format for launch command.""" - test_log_dir = Path("/tmp/test_vec_inf_logs") - with ( - patch("vec_inf.client._utils.run_bash_command") as mock_run, - patch("pathlib.Path.mkdir"), - patch("builtins.open", debug_helper.tracked_mock_open), - patch("pathlib.Path.open", debug_helper.tracked_mock_open), - patch("pathlib.Path.exists", new=path_exists), - patch("pathlib.Path.expanduser", return_value=test_log_dir), - patch("pathlib.Path.resolve", return_value=debug_helper.config_file.parent), - patch( - "pathlib.Path.parent", return_value=debug_helper.config_file.parent.parent - ), - patch("json.dump"), - patch("pathlib.Path.touch"), - patch("pathlib.Path.__truediv__", return_value=test_log_dir), - ): + with ExitStack() as stack: + # Apply all base patches + for patch_obj in base_patches: + stack.enter_context(patch_obj) + + # Apply specific patches for this test + mock_run = stack.enter_context(patch("vec_inf.client._utils.run_bash_command")) + stack.enter_context(patch("pathlib.Path.exists", new=path_exists)) + expected_job_id = "14933051" mock_run.return_value = mock_launch_output(expected_job_id) @@ -319,7 +317,7 @@ def test_launch_command_with_json_output( assert output.get("slurm_job_id") == expected_job_id assert output.get("model_name") == "Meta-Llama-3.1-8B" assert output.get("model_type") == "LLM" - assert str(test_log_dir) in 
output.get("log_dir", "") + assert str(test_paths["log_dir"]) in output.get("log_dir", "") def test_launch_command_no_model_weights_parent_dir(runner, debug_helper, base_patches): diff --git a/vec_inf/cli/_helper.py b/vec_inf/cli/_helper.py index 9d2872d2..af84fa21 100644 --- a/vec_inf/cli/_helper.py +++ b/vec_inf/cli/_helper.py @@ -1,6 +1,6 @@ """Helper classes for the CLI.""" -import os +from pathlib import Path from typing import Any, Union import click @@ -59,9 +59,10 @@ def format_table_output(self) -> Table: ) if self.params.get("enforce_eager"): table.add_row("Enforce Eager", self.params["enforce_eager"]) - - # Add path details - table.add_row("Model Weights Directory", os.environ.get("MODEL_WEIGHTS")) + table.add_row( + "Model Weights Directory", + str(Path(self.params["model_weights_parent_dir"], self.model_name)), + ) table.add_row("Log Directory", self.params["log_dir"]) return table diff --git a/vec_inf/client/_helper.py b/vec_inf/client/_helper.py index d5b9481a..0fa5c4c2 100644 --- a/vec_inf/client/_helper.py +++ b/vec_inf/client/_helper.py @@ -25,12 +25,14 @@ ModelType, StatusResponse, ) +from vec_inf.client._slurm_script_generator import SlurmScriptGenerator from vec_inf.client._vars import ( BOOLEAN_FIELDS, LD_LIBRARY_PATH, REQUIRED_FIELDS, + SINGULARITY_IMAGE, SRC_DIR, - VLLM_TASK_MAP, + VLLM_NCCL_SO_PATH, ) @@ -50,6 +52,7 @@ def __init__(self, model_name: str, kwargs: Optional[dict[str, Any]]): self.model_name = model_name self.kwargs = kwargs or {} self.slurm_job_id = "" + self.slurm_script_path = Path("") self.model_config = self._get_model_configuration() self.params = self._get_launch_params() @@ -137,31 +140,9 @@ def _get_launch_params(self) -> dict[str, Any]: def _set_env_vars(self) -> None: """Set environment variables for the launch command.""" - os.environ["MODEL_NAME"] = self.model_name - os.environ["MAX_MODEL_LEN"] = self.params["max_model_len"] - os.environ["MAX_LOGPROBS"] = self.params["vocab_size"] - os.environ["DATA_TYPE"] = 
self.params["data_type"] - os.environ["MAX_NUM_SEQS"] = self.params["max_num_seqs"] - os.environ["GPU_MEMORY_UTILIZATION"] = self.params["gpu_memory_utilization"] - os.environ["TASK"] = VLLM_TASK_MAP[self.params["model_type"]] - os.environ["PIPELINE_PARALLELISM"] = self.params["pipeline_parallelism"] - os.environ["COMPILATION_CONFIG"] = self.params["compilation_config"] - os.environ["SRC_DIR"] = SRC_DIR - os.environ["MODEL_WEIGHTS"] = str( - Path(self.params["model_weights_parent_dir"], self.model_name) - ) os.environ["LD_LIBRARY_PATH"] = LD_LIBRARY_PATH - os.environ["VENV_BASE"] = self.params["venv"] - os.environ["LOG_DIR"] = self.params["log_dir"] - - if self.params.get("enable_prefix_caching"): - os.environ["ENABLE_PREFIX_CACHING"] = self.params["enable_prefix_caching"] - if self.params.get("enable_chunked_prefill"): - os.environ["ENABLE_CHUNKED_PREFILL"] = self.params["enable_chunked_prefill"] - if self.params.get("max_num_batched_tokens"): - os.environ["MAX_NUM_BATCHED_TOKENS"] = self.params["max_num_batched_tokens"] - if self.params.get("enforce_eager"): - os.environ["ENFORCE_EAGER"] = self.params["enforce_eager"] + os.environ["VLLM_NCCL_SO_PATH"] = VLLM_NCCL_SO_PATH + os.environ["SINGULARITY_IMAGE"] = SINGULARITY_IMAGE def _build_launch_command(self) -> str: """Construct the full launch command with parameters.""" @@ -187,10 +168,10 @@ def _build_launch_command(self) -> str: ] ) # Add slurm script - slurm_script = "vllm.slurm" - if int(self.params["num_nodes"]) > 1: - slurm_script = "multinode_vllm.slurm" - command_list.append(f"{SRC_DIR}/{slurm_script}") + self.slurm_script_path = SlurmScriptGenerator( + self.params, SRC_DIR + ).write_to_log_dir() + command_list.append(str(self.slurm_script_path)) return " ".join(command_list) def launch(self) -> LaunchResponse: @@ -207,15 +188,22 @@ def launch(self) -> LaunchResponse: self.slurm_job_id = command_output.split(" ")[-1].strip().strip("\n") self.params["slurm_job_id"] = self.slurm_job_id - # Create log 
class SlurmScriptGenerator:
    """Generate SLURM batch scripts that launch a vLLM OpenAI-compatible server.

    Renders either a single-node script or a multinode script (which first
    assembles a Ray cluster across the allocated nodes) from the resolved
    launch parameters, and writes it into the configured log directory.
    """

    def __init__(self, params: dict[str, Any], src_dir: str):
        """Initialize the generator.

        Parameters
        ----------
        params : dict[str, Any]
            Resolved launch parameters. NOTE(review): values appear to be
            strings (cf. the ``int(...)`` conversion below and the
            ``== "True"`` flag checks) — confirm against the config loader.
        src_dir : str
            Package source directory containing ``find_port.sh``.
        """
        self.params = params
        self.src_dir = src_dir
        self.is_multinode = int(self.params["num_nodes"]) > 1
        self.model_weights_path = str(
            Path(params["model_weights_parent_dir"], params["model_name"])
        )
        self.task = VLLM_TASK_MAP[self.params["model_type"]]

    def _generate_script_content(self) -> str:
        """Assemble the full script: preamble, server setup, launcher, vLLM args."""
        preamble = self._generate_preamble()
        server = self._generate_server_script()
        launcher = self._generate_launcher()
        args = self._generate_shared_args()
        return preamble + server + launcher + args

    def _generate_preamble(self) -> str:
        """Return the ``#SBATCH`` resource-directive preamble."""
        base = [
            "#!/bin/bash",
            "#SBATCH --cpus-per-task=16",
            "#SBATCH --mem=64G",
        ]
        if self.is_multinode:
            # A multinode Ray cluster needs whole nodes, one task per node.
            base += [
                "#SBATCH --exclusive",
                "#SBATCH --tasks-per-node=1",
            ]
        base += [""]
        return "\n".join(base)

    def _generate_shared_args(self) -> str:
        """Return the vLLM CLI arguments shared by both launch modes."""
        # Params are stored as strings (cf. int(...) in __init__), so boolean
        # flags are compared against "True" -- matching the flag checks
        # below -- and numeric fields are converted before arithmetic.
        if self.is_multinode and self.params["pipeline_parallelism"] != "True":
            # No pipeline parallelism: tensor-parallel across every GPU.
            tensor_parallel_size = int(self.params["num_nodes"]) * int(
                self.params["gpus_per_node"]
            )
            pipeline_parallel_size = 1
        else:
            tensor_parallel_size = int(self.params["gpus_per_node"])
            pipeline_parallel_size = int(self.params["num_nodes"])

        args = [
            f"--model {self.model_weights_path} \\",
            f"--served-model-name {self.params['model_name']} \\",
            '--host "0.0.0.0" \\',
            "--port $vllm_port_number \\",
            f"--tensor-parallel-size {tensor_parallel_size} \\",
            f"--dtype {self.params['data_type']} \\",
            "--trust-remote-code \\",
            f"--max-logprobs {self.params['vocab_size']} \\",
            f"--max-model-len {self.params['max_model_len']} \\",
            f"--max-num-seqs {self.params['max_num_seqs']} \\",
            f"--gpu-memory-utilization {self.params['gpu_memory_utilization']} \\",
            f"--compilation-config {self.params['compilation_config']} \\",
            f"--task {self.task} \\",
        ]
        if self.is_multinode:
            # Keep --pipeline-parallel-size next to --tensor-parallel-size.
            args.insert(4, f"--pipeline-parallel-size {pipeline_parallel_size} \\")
        if self.params.get("max_num_batched_tokens"):
            args.append(
                f"--max-num-batched-tokens={self.params['max_num_batched_tokens']} \\"
            )
        if self.params.get("enable_prefix_caching") == "True":
            args.append("--enable-prefix-caching \\")
        if self.params.get("enable_chunked_prefill") == "True":
            args.append("--enable-chunked-prefill \\")
        if self.params.get("enforce_eager") == "True":
            args.append("--enforce-eager")

        # The script ends with these args: the last line must not leave a
        # dangling shell line continuation.
        if args[-1].endswith(" \\"):
            args[-1] = args[-1][:-2]

        return "\n".join(args)

    def _generate_server_script(self) -> str:
        """Return the shell section that finds a port, starts the server side,
        and records the server address in the job's JSON file."""
        server_script = [""]
        if self.params["venv"] == "singularity":
            server_script.append("""module load singularity-ce/3.8.2
singularity exec $SINGULARITY_IMAGE ray stop
""")
        server_script.append(f"source {self.src_dir}/find_port.sh\n")
        server_script.append(
            self._generate_multinode_server_script()
            if self.is_multinode
            else self._generate_single_node_server_script()
        )
        server_script.append(f"""json_path="{self.params['log_dir']}/{self.params['model_name']}.$SLURM_JOB_ID/{self.params['model_name']}.$SLURM_JOB_ID.json"

jq --arg server_addr "$server_address" \\
    '. + {{"server_address": $server_addr}}' \\
    "$json_path" > temp.json \\
    && mv temp.json "$json_path"

""")
        return "\n".join(server_script)

    def _generate_single_node_server_script(self) -> str:
        """Return the single-node snippet: pick a free port on this node."""
        return """hostname=${SLURMD_NODENAME}
vllm_port_number=$(find_available_port ${hostname} 8080 65535)

server_address="http://${hostname}:${vllm_port_number}/v1"
echo "Server address: $server_address"
"""

    def _generate_multinode_server_script(self) -> str:
        """Return the multinode snippet: start a Ray head plus one worker per
        remaining node, then pick the vLLM port on the head node."""
        server_script = []
        server_script.append("""nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
nodes_array=($nodes)

head_node=${nodes_array[0]}
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)

head_node_port=$(find_available_port $head_node_ip 8080 65535)

ip_head=$head_node_ip:$head_node_port
export ip_head
echo "IP Head: $ip_head"

echo "Starting HEAD at $head_node"
srun --nodes=1 --ntasks=1 -w "$head_node" \\""")

        if self.params["venv"] == "singularity":
            server_script.append(
                f"    singularity exec --nv --bind {self.model_weights_path}:{self.model_weights_path} $SINGULARITY_IMAGE \\"
            )

        server_script.append("""    ray start --head --node-ip-address="$head_node_ip" --port=$head_node_port \\
    --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_NODE}" --block &

sleep 10
worker_num=$((SLURM_JOB_NUM_NODES - 1))

for ((i = 1; i <= worker_num; i++)); do
    node_i=${nodes_array[$i]}
    echo "Starting WORKER $i at $node_i"
    srun --nodes=1 --ntasks=1 -w "$node_i" \\""")

        if self.params["venv"] == "singularity":
            server_script.append(
                f"""        singularity exec --nv --bind {self.model_weights_path}:{self.model_weights_path} $SINGULARITY_IMAGE \\"""
            )
        server_script.append("""        ray start --address "$ip_head" \\
        --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_NODE}" --block &
    sleep 5
done

vllm_port_number=$(find_available_port $head_node_ip 8080 65535)

server_address="http://${head_node_ip}:${vllm_port_number}/v1"
echo "Server address: $server_address"

""")
        return "\n".join(server_script)

    def _generate_launcher(self) -> str:
        """Return the command that activates the environment and execs vLLM."""
        if self.params["venv"] == "singularity":
            launcher_script = [
                f"""singularity exec --nv --bind {self.model_weights_path}:{self.model_weights_path} $SINGULARITY_IMAGE \\"""
            ]
        else:
            launcher_script = [f"""source {self.params['venv']}/bin/activate"""]
        launcher_script.append(
            """python3.10 -m vllm.entrypoints.openai.api_server \\\n"""
        )
        return "\n".join(launcher_script)

    def write_to_log_dir(self) -> Path:
        """Render the script and write it into the log directory.

        Returns
        -------
        Path
            Path of the timestamped ``launch_<model>_<ts>.slurm`` file.
        """
        log_dir = Path(self.params["log_dir"])
        # The per-job subdirectory is only created after submission; make
        # sure the root log directory itself exists before writing.
        log_dir.mkdir(parents=True, exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        script_path: Path = (
            log_dir / f"launch_{self.params['model_name']}_{timestamp}.slurm"
        )

        script_path.write_text(self._generate_script_content())
        return script_path
VLLM_NCCL_SO_PATH=/vec-inf/nccl/libnccl.so.2.18.1 - module load singularity-ce/3.8.2 - singularity exec $SINGULARITY_IMAGE ray stop -fi - -# Getting the node names -nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -nodes_array=($nodes) - -head_node=${nodes_array[0]} -head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) - -# Find port for head node -head_node_port=$(find_available_port $head_node_ip 8080 65535) - -# Starting the Ray head node -ip_head=$head_node_ip:$head_node_port -export ip_head -echo "IP Head: $ip_head" - -echo "Starting HEAD at $head_node" -if [ "$VENV_BASE" = "singularity" ]; then - srun --nodes=1 --ntasks=1 -w "$head_node" \ - singularity exec --nv --bind ${MODEL_WEIGHTS}:${MODEL_WEIGHTS} $SINGULARITY_IMAGE \ - ray start --head --node-ip-address="$head_node_ip" --port=$head_node_port \ - --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_NODE}" --block & -else - srun --nodes=1 --ntasks=1 -w "$head_node" \ - ray start --head --node-ip-address="$head_node_ip" --port=$head_node_port \ - --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_NODE}" --block & -fi - -# Starting the Ray worker nodes -# Optional, though may be useful in certain versions of Ray < 1.0. 
-sleep 10 - -# number of nodes other than the head node -worker_num=$((SLURM_JOB_NUM_NODES - 1)) - -for ((i = 1; i <= worker_num; i++)); do - node_i=${nodes_array[$i]} - echo "Starting WORKER $i at $node_i" - if [ "$VENV_BASE" = "singularity" ]; then - srun --nodes=1 --ntasks=1 -w "$node_i" \ - singularity exec --nv --bind ${MODEL_WEIGHTS}:${MODEL_WEIGHTS} $SINGULARITY_IMAGE \ - ray start --address "$ip_head" \ - --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_NODE}" --block & - else - srun --nodes=1 --ntasks=1 -w "$node_i" \ - ray start --address "$ip_head" \ - --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_NODE}" --block & - fi - - sleep 5 -done - - -vllm_port_number=$(find_available_port $head_node_ip 8080 65535) - -SERVER_ADDR="http://${head_node_ip}:${vllm_port_number}/v1" -echo "Server address: $SERVER_ADDR" - -jq --arg server_addr "$SERVER_ADDR" \ - '. + {"server_address": $server_addr}' \ - "$LOG_DIR/$MODEL_NAME.$SLURM_JOB_ID/$MODEL_NAME.$SLURM_JOB_ID.json" > temp.json \ - && mv temp.json "$LOG_DIR/$MODEL_NAME.$SLURM_JOB_ID/$MODEL_NAME.$SLURM_JOB_ID.json" \ - && rm temp.json - -if [ "$PIPELINE_PARALLELISM" = "True" ]; then - export PIPELINE_PARALLEL_SIZE=$SLURM_JOB_NUM_NODES - export TENSOR_PARALLEL_SIZE=$SLURM_GPUS_PER_NODE -else - export PIPELINE_PARALLEL_SIZE=1 - export TENSOR_PARALLEL_SIZE=$((SLURM_JOB_NUM_NODES*SLURM_GPUS_PER_NODE)) -fi - -if [ "$ENFORCE_EAGER" = "True" ]; then - export ENFORCE_EAGER="--enforce-eager" -else - export ENFORCE_EAGER="" -fi - -if [ "$ENABLE_PREFIX_CACHING" = "True" ]; then - export ENABLE_PREFIX_CACHING="--enable-prefix-caching" -else - export ENABLE_PREFIX_CACHING="" -fi - -if [ "$ENABLE_CHUNKED_PREFILL" = "True" ]; then - export ENABLE_CHUNKED_PREFILL="--enable-chunked-prefill" -else - export ENABLE_CHUNKED_PREFILL="" -fi - -if [ -z "$MAX_NUM_BATCHED_TOKENS" ]; then - export MAX_NUM_BATCHED_TOKENS="" -else - export 
MAX_NUM_BATCHED_TOKENS="--max-num-batched-tokens=$MAX_NUM_BATCHED_TOKENS" -fi - -# Activate vllm venv -if [ "$VENV_BASE" = "singularity" ]; then - singularity exec --nv --bind ${MODEL_WEIGHTS}:${MODEL_WEIGHTS} $SINGULARITY_IMAGE \ - python3.10 -m vllm.entrypoints.openai.api_server \ - --model ${MODEL_WEIGHTS} \ - --served-model-name ${MODEL_NAME} \ - --host "0.0.0.0" \ - --port ${vllm_port_number} \ - --pipeline-parallel-size ${PIPELINE_PARALLEL_SIZE} \ - --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \ - --dtype ${DATA_TYPE} \ - --trust-remote-code \ - --max-logprobs ${MAX_LOGPROBS} \ - --max-model-len ${MAX_MODEL_LEN} \ - --max-num-seqs ${MAX_NUM_SEQS} \ - --gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} \ - --compilation-config ${COMPILATION_CONFIG} \ - --task ${TASK} \ - ${MAX_NUM_BATCHED_TOKENS} \ - ${ENABLE_PREFIX_CACHING} \ - ${ENABLE_CHUNKED_PREFILL} \ - ${ENFORCE_EAGER} -else - source ${VENV_BASE}/bin/activate - python3 -m vllm.entrypoints.openai.api_server \ - --model ${MODEL_WEIGHTS} \ - --served-model-name ${MODEL_NAME} \ - --host "0.0.0.0" \ - --port ${vllm_port_number} \ - --pipeline-parallel-size ${PIPELINE_PARALLEL_SIZE} \ - --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \ - --dtype ${DATA_TYPE} \ - --trust-remote-code \ - --max-logprobs ${MAX_LOGPROBS} \ - --max-model-len ${MAX_MODEL_LEN} \ - --max-num-seqs ${MAX_NUM_SEQS} \ - --gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} \ - --compilation-config ${COMPILATION_CONFIG} \ - --task ${TASK} \ - ${MAX_NUM_BATCHED_TOKENS} \ - ${ENABLE_PREFIX_CACHING} \ - ${ENABLE_CHUNKED_PREFILL} \ - ${ENFORCE_EAGER} -fi diff --git a/vec_inf/vllm.slurm b/vec_inf/vllm.slurm deleted file mode 100644 index e9729d10..00000000 --- a/vec_inf/vllm.slurm +++ /dev/null @@ -1,90 +0,0 @@ -#!/bin/bash -#SBATCH --cpus-per-task=16 -#SBATCH --mem=64G - -source ${SRC_DIR}/find_port.sh - -# Write server url to file -hostname=${SLURMD_NODENAME} -vllm_port_number=$(find_available_port $hostname 8080 65535) - 
-SERVER_ADDR="http://${hostname}:${vllm_port_number}/v1" -echo "Server address: $SERVER_ADDR" - -jq --arg server_addr "$SERVER_ADDR" \ - '. + {"server_address": $server_addr}' \ - "$LOG_DIR/$MODEL_NAME.$SLURM_JOB_ID/$MODEL_NAME.$SLURM_JOB_ID.json" > temp.json \ - && mv temp.json "$LOG_DIR/$MODEL_NAME.$SLURM_JOB_ID/$MODEL_NAME.$SLURM_JOB_ID.json" \ - && rm temp.json - -if [ "$ENFORCE_EAGER" = "True" ]; then - export ENFORCE_EAGER="--enforce-eager" -else - export ENFORCE_EAGER="" -fi - -if [ "$ENABLE_PREFIX_CACHING" = "True" ]; then - export ENABLE_PREFIX_CACHING="--enable-prefix-caching" -else - export ENABLE_PREFIX_CACHING="" -fi - -if [ "$ENABLE_CHUNKED_PREFILL" = "True" ]; then - export ENABLE_CHUNKED_PREFILL="--enable-chunked-prefill" -else - export ENABLE_CHUNKED_PREFILL="" -fi - -if [ -z "$MAX_NUM_BATCHED_TOKENS" ]; then - export MAX_NUM_BATCHED_TOKENS="" -else - export MAX_NUM_BATCHED_TOKENS="--max-num-batched-tokens=$MAX_NUM_BATCHED_TOKENS" -fi - -# Activate vllm venv -if [ "$VENV_BASE" = "singularity" ]; then - export SINGULARITY_IMAGE=/model-weights/vec-inf-shared/vector-inference_latest.sif - export VLLM_NCCL_SO_PATH=/vec-inf/nccl/libnccl.so.2.18.1 - module load singularity-ce/3.8.2 - singularity exec $SINGULARITY_IMAGE ray stop - singularity exec --nv --bind ${MODEL_WEIGHTS}:${MODEL_WEIGHTS} $SINGULARITY_IMAGE \ - python3.10 -m vllm.entrypoints.openai.api_server \ - --model ${MODEL_WEIGHTS} \ - --served-model-name ${MODEL_NAME} \ - --host "0.0.0.0" \ - --port ${vllm_port_number} \ - --tensor-parallel-size ${SLURM_GPUS_PER_NODE} \ - --dtype ${DATA_TYPE} \ - --max-logprobs ${MAX_LOGPROBS} \ - --trust-remote-code \ - --max-model-len ${MAX_MODEL_LEN} \ - --max-num-seqs ${MAX_NUM_SEQS} \ - --gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} \ - --compilation-config ${COMPILATION_CONFIG} \ - --task ${TASK} \ - ${MAX_NUM_BATCHED_TOKENS} \ - ${ENABLE_PREFIX_CACHING} \ - ${ENABLE_CHUNKED_PREFILL} \ - ${ENFORCE_EAGER} - -else - source ${VENV_BASE}/bin/activate - 
python3 -m vllm.entrypoints.openai.api_server \ - --model ${MODEL_WEIGHTS} \ - --served-model-name ${MODEL_NAME} \ - --host "0.0.0.0" \ - --port ${vllm_port_number} \ - --tensor-parallel-size ${SLURM_GPUS_PER_NODE} \ - --dtype ${DATA_TYPE} \ - --max-logprobs ${MAX_LOGPROBS} \ - --trust-remote-code \ - --max-model-len ${MAX_MODEL_LEN} \ - --max-num-seqs ${MAX_NUM_SEQS} \ - --gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} \ - --compilation-config ${COMPILATION_CONFIG} \ - --task ${TASK} \ - ${MAX_NUM_BATCHED_TOKENS} \ - ${ENABLE_PREFIX_CACHING} \ - ${ENABLE_CHUNKED_PREFILL} \ - ${ENFORCE_EAGER} -fi