diff --git a/.gitignore b/.gitignore
index c2acd1cd921..c588d39d9ba 100644
--- a/.gitignore
+++ b/.gitignore
@@ -72,6 +72,7 @@ ad-test-workspace/
 */tllm_debug/**
 *.patch
 !cpp/tensorrt_llm/deep_ep/*.patch
+examples/disaggregated/slurm/benchmark/logs/
 
 # Generated files
 cpp/include/tensorrt_llm/executor/version.h
diff --git a/examples/disaggregated/slurm/benchmark/accuracy_eval.sh b/examples/disaggregated/slurm/benchmark/accuracy_eval.sh
deleted file mode 100644
index 028b5344a9d..00000000000
--- a/examples/disaggregated/slurm/benchmark/accuracy_eval.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/bin/bash
-set -euo pipefail
-
-# Parse arguments
-full_logdir=${1}
-accuracy_model=${2}
-accuracy_tasks=${3}
-model_path=${4}
-model_args_extra=${5}
-output_dir=${6}
-hostname=${7}
-port=${8}
-
-echo "Starting accuracy evaluation..."
-echo "Log directory: ${full_logdir}"
-
-echo "Hostname: ${hostname}, Port: ${port}"
-base_url="http://${hostname}:${port}/v1/completions"
-echo "Using base_url: ${base_url}"
-
-# Install lm_eval and run evaluation
-echo "Installing lm_eval[api] and running evaluation..."
-pip install lm_eval[api]==0.4.8
-
-echo "Running lm_eval with tasks: ${accuracy_tasks}..."
-
-mkdir -p ${output_dir}
-lm_eval --model ${accuracy_model} \
-    --tasks ${accuracy_tasks} \
-    --model_args model=${model_path},base_url=${base_url},${model_args_extra} \
-    --output_path ${output_dir} --log_samples \
-    --trust_remote_code
-
-echo "Accuracy evaluation completed successfully"
diff --git a/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm b/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm
index 597fd51911d..1938db569ff 100644
--- a/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm
+++ b/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm
@@ -86,7 +86,7 @@ if [ -n "${trtllm_wheel_path}" ]; then
     if ! srun --container-name=${container_name} \
         --container-mounts=${container_mount} --no-container-mount-home \
         --mpi=pmix --overlap -N $SLURM_NNODES --ntasks-per-node=1 \
-        bash -c "pip install ${trtllm_wheel_path}" \
+        bash -c "pip install ${trtllm_wheel_path}[devel]" \
         &> ${full_logdir}/2_install.log; then
         cleanup_on_failure "TensorRT-LLM wheel installation failed. Check ${full_logdir}/2_install.log for details"
     fi
@@ -117,7 +117,7 @@ elif [ -d "${trtllm_repo}" ]; then
     if ! srun --container-name=${container_name} \
         --container-mounts=${container_mount} --no-container-mount-home \
         --mpi=pmix --overlap -N $SLURM_NNODES --ntasks-per-node=1 \
-        bash -c "cd ${trtllm_repo} && pip install -e ." \
+        bash -c "cd ${trtllm_repo} && pip install -e .[devel]" \
         &> ${full_logdir}/2_install.log; then
         cleanup_on_failure "TensorRT-LLM installation failed. Check ${full_logdir}/2_install.log for details"
     fi
diff --git a/examples/disaggregated/slurm/benchmark/submit.py b/examples/disaggregated/slurm/benchmark/submit.py
index a12e675134b..d9bbb9612aa 100644
--- a/examples/disaggregated/slurm/benchmark/submit.py
+++ b/examples/disaggregated/slurm/benchmark/submit.py
@@ -212,7 +212,7 @@ def submit_job(config, log_dir, dry_run):
     # Create base log directory path
     date_prefix = datetime.now().strftime("%Y%m%d")
     log_base = os.path.join(env_config['work_dir'],
-                            f"{date_prefix}/{isl}-{osl}")
+                            f"logs/{date_prefix}/{isl}-{osl}")
 
     # Get eplb num_slots for gen worker
     load_balancer_config = worker_config['gen'].get('moe_config', {}).get(
@@ -340,27 +340,50 @@ def submit_job(config, log_dir, dry_run):
         f"--container-mounts={env_config['container_mount']}",
         f"--mpi=pmix --overlap -N 1 -n 1",
     ]
+    env_var = config['benchmark'].get('env_var', '')
+    benchmark_prefix = client_slurm_prefix + [f"--export \"{env_var}\""]
     if benchmark_config['use_nv_sa_benchmark']:
         benchmark_cmd = [
             f"bash {env_config['work_dir']}/run_benchmark_nv_sa.sh",
             f"'{env_config['model_path']}' {isl} {osl} {benchmark_config['benchmark_ratio']} {benchmark_config['multi_round']} {gen_num} '{benchmark_config['concurrency_list']}' {benchmark_config['streaming']} '{log_dir}' {disagg_server_hostname} {disagg_server_port}",
             f"&> {log_dir}/6_bench.log"
         ]
-        client_cmds.append(" ".join(client_slurm_prefix + benchmark_cmd))
+        client_cmds.append(" ".join(benchmark_prefix + benchmark_cmd))
     else:
         benchmark_cmd = [
             f"bash {env_config['work_dir']}/run_benchmark.sh",
             f"'{env_config['model_path']}' '{benchmark_config['dataset_file']}' {benchmark_config['multi_round']} {gen_num} '{benchmark_config['concurrency_list']}' {benchmark_config['streaming']} '{log_dir}' {disagg_server_hostname} {disagg_server_port}",
             f"&> {log_dir}/6_bench.log"
         ]
-        client_cmds.append(" ".join(client_slurm_prefix + benchmark_cmd))
+        client_cmds.append(" ".join(benchmark_prefix + benchmark_cmd))
     if config['accuracy']['enable_accuracy_test']:
-        accuracy_cmd = [
-            f"bash {env_config['work_dir']}/accuracy_eval.sh",
-            f"'{log_dir}' '{config['accuracy']['model']}' '{config['accuracy']['tasks']}' '{env_config['model_path']}' '{config['accuracy']['model_args_extra']}' '{log_dir}/accuracy_eval' {disagg_server_hostname} {disagg_server_port}",
-            f"&> {log_dir}/7_accuracy_eval.log"
-        ]
-        client_cmds.append(" ".join(client_slurm_prefix + accuracy_cmd))
+        env_var = config['accuracy'].get('env_var', '')
+        accuracy_prefix = client_slurm_prefix + [f"--export \"{env_var}\""]
+        for task in config['accuracy']['tasks']:
+            extra_kwargs = config['accuracy']['tasks'][task].get('extra_kwargs', {})
+            extra_kwargs_str = ""
+            for key, value in extra_kwargs.items():
+                if isinstance(value, bool):
+                    if value:
+                        extra_kwargs_str += f" --{key}"
+                else:
+                    extra_kwargs_str += f" --{key}='{value}'"
+            end_point_map = {
+                'local-completions': 'v1/completions',
+                'local-chat-completions': 'v1/chat/completions',
+            }
+            model = config['accuracy']['tasks'][task]['model']
+            accuracy_cmd = [
+                'lm_eval',
+                '--model', model,
+                '--tasks', task,
+                '--model_args', f"model={env_config['model_path']},base_url=http://{disagg_server_hostname}:{disagg_server_port}/{end_point_map[model]},{config['accuracy']['tasks'][task]['model_args_extra']}",
+                '--log_samples',
+                '--output_path', f'{log_dir}/accuracy_eval_{task}',
+                extra_kwargs_str,
+                f"&> {log_dir}/7_accuracy_eval_{task}.log"
+            ]
+            client_cmds.append(" ".join(accuracy_prefix + accuracy_cmd))
     with open(os.path.join(log_dir,
                            "client_cmds.sh"), "w") as f:
         f.write("\n".join(client_cmds) + "\n")