[None] [fix] Fix slurm scripts (#10007)

kaiyux · web-flow · commit ef4ea955b20c · 2025-12-15T04:20:53.000-08:00
Signed-off-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
diff --git a/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm b/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm
@@ -16,6 +16,7 @@ while [[ $# -gt 0 ]]; do
         --benchmark-ratio) benchmark_ratio="$2"; shift 2 ;;
         --streaming) streaming="$2"; shift 2 ;;
         --use-nv-sa-benchmark) use_nv_sa_benchmark="$2"; shift 2 ;;
+        --benchmark-mode) benchmark_mode="$2"; shift 2 ;;
 
         # Environment and paths
         --dataset-file) dataset_file="$2"; shift 2 ;;
@@ -59,6 +60,7 @@ echo "  multi_round: ${multi_round}"
 echo "  benchmark_ratio: ${benchmark_ratio}"
 echo "  streaming: ${streaming}"
 echo "  use_nv_sa_benchmark: ${use_nv_sa_benchmark}"
+echo "  benchmark_mode: ${benchmark_mode}"
 echo
 echo "Environment Configuration:"
 echo "  dataset_file: ${dataset_file}"
diff --git a/examples/disaggregated/slurm/benchmark/gen_server_config.py b/examples/disaggregated/slurm/benchmark/gen_server_config.py
@@ -78,10 +78,8 @@
         'port': args.server_port,
         'backend': 'pytorch',
         'context_servers': {
-            'num_instances':
-            0 if gen_only else args.num_ctx_servers,
-            'urls': [] if gen_only else
-            [f'{host}:{args.worker_port}' for host in ctx_hostnames]
+            'num_instances': 0 if gen_only else args.num_ctx_servers,
+            'urls': [] if gen_only else ctx_urls
         },
         'generation_servers': {
             'num_instances': args.num_gen_servers,
diff --git a/examples/disaggregated/slurm/benchmark/submit.py b/examples/disaggregated/slurm/benchmark/submit.py
@@ -26,10 +26,10 @@ def parse_args():
                        '--dir',
                        type=str,
                        help='Directory containing YAML configuration files')
-    group.add_argument('--log-dir',
-                       type=str,
-                       default=None,
-                       help='Log directory')
+    parser.add_argument('--log-dir',
+                        type=str,
+                        default=None,
+                        help='Log directory')
     return parser.parse_args()
 
 
@@ -154,16 +154,20 @@ def submit_job(config, log_dir):
                               {}).get('num_nextn_predict_layers', 0)
 
     # Calculate nodes based on world sizes
-    ctx_tp_size = config['worker_config']['ctx']['tensor_parallel_size']
-    ctx_cp_size = config['worker_config']['ctx']['context_parallel_size']
-    ctx_pp_size = config['worker_config']['ctx']['pipeline_parallel_size']
+    ctx_tp_size = config['worker_config']['ctx'].get('tensor_parallel_size', 1)
+    ctx_cp_size = config['worker_config']['ctx'].get('context_parallel_size', 1)
+    ctx_pp_size = config['worker_config']['ctx'].get('pipeline_parallel_size',
+                                                     1)
     ctx_world_size = ctx_tp_size * ctx_cp_size * ctx_pp_size
     ctx_nodes = calculate_nodes(ctx_world_size, ctx_num, gpus_per_node)
-    gen_tp_size = config['worker_config']['gen']['tensor_parallel_size']
-    gen_cp_size = config['worker_config']['gen']['context_parallel_size']
-    gen_pp_size = config['worker_config']['gen']['pipeline_parallel_size']
+
+    gen_tp_size = config['worker_config']['gen'].get('tensor_parallel_size', 1)
+    gen_cp_size = config['worker_config']['gen'].get('context_parallel_size', 1)
+    gen_pp_size = config['worker_config']['gen'].get('pipeline_parallel_size',
+                                                     1)
     gen_world_size = gen_tp_size * gen_cp_size * gen_pp_size
     gen_nodes = calculate_nodes(gen_world_size, gen_num, gpus_per_node)
+
     total_nodes = ctx_nodes + gen_nodes
     total_tasks = total_nodes * gpus_per_node
 
@@ -259,7 +263,7 @@ def submit_job(config, log_dir):
             str(allocation["port"]),
             config['benchmark']['mode'],
             config['benchmark']['concurrency_list'],
-            str(slurm_config['numa_bind']),
+            str(slurm_config['numa_bind']).lower(),
             log_dir,
             str(profiling_config['nsys_on']).lower(),
             profiling_config['gen_profile_range']
@@ -303,6 +307,7 @@ def submit_job(config, log_dir):
         '--benchmark-ratio', str(config['benchmark']['benchmark_ratio']),
         '--streaming', str(config['benchmark']['streaming']).lower(),
         '--use-nv-sa-benchmark', str(config['benchmark']['use_nv_sa_benchmark']).lower(),
+        '--benchmark-mode', config['benchmark']['mode'],
 
         # Environment and paths
         '--dataset-file', config['benchmark']['dataset_file'],