Describe the bug
I am running a 2-node GRPO training job for Qwen3-8B.
- Node 1 (VLLM_NODE) runs swift rollout as the vLLM inference server.
- Node 2 (TRAIN_NODE) runs swift rlhf as the trainer, configured with --vllm_mode server.
The job starts as expected: the until curl ... health-check loop waits until the server reports healthy, and the swift rlhf process then begins. The vLLM server log shows it receiving and processing requests from the trainer (e.g., POST /update_flattened_params/ HTTP/1.1" 200 OK).
However, after a few of these parameter updates, the whole job freezes: the server stops logging new requests, and the trainer hangs indefinitely without crashing or emitting any further output.
Any guidance on how to debug or work around this would be greatly appreciated.
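When the freeze happens, stack dumps from both processes would show exactly where they are blocked. A minimal way to collect them (a sketch, assuming py-spy is installed in the same venv; the PIDs are placeholders):

# On TRAIN_NODE: dump the trainer's stack, including native (NCCL) frames
py-spy dump --pid <trainer_pid> --native
# On VLLM_NODE: dump the rollout server's stack
py-spy dump --pid <rollout_pid> --native

Happy to collect and attach these dumps if that would help.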
Slurm Script
#!/bin/bash
#SBATCH --nodes=2
# ... (other slurm settings) ...
set -exu
NODELIST=($(scontrol show hostnames $SLURM_JOB_NODELIST))
TRAIN_NODE="${NODELIST[0]}"
VLLM_NODE="${NODELIST[1]}"
VLLM_PORT=8000
source .venv/bin/activate
# --- 1. vLLM Server ---
CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
srun --nodes=1 --ntasks=1 --nodelist="${VLLM_NODE}" \
swift rollout \
--model path/to/Qwen3-8B/base \
--model_type qwen3 \
--max_turns 5 \
--use_gym_env true \
--gym_env red_teaming_gym \
--external_plugins path/to/plugin.py \
--host $VLLM_NODE \
--port $VLLM_PORT \
--vllm_tensor_parallel_size 8 \
--vllm_use_async_engine true \
--vllm_enable_lora true \
--vllm_max_lora_rank 32 \
--vllm_gpu_memory_utilization 0.8 \
--vllm_mm_processor_cache_gb 0 &
until curl -s http://${VLLM_NODE}:${VLLM_PORT}/health > /dev/null
do
echo "Waiting vLLM Server..."
sleep 10
done
echo "✅ vLLM server is ready."
# --- 2. GRPO Trainer ---
CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
NPROC_PER_NODE=8 \
srun --nodes=1 --ntasks=1 --nodelist="${TRAIN_NODE}" \
swift rlhf \
--rlhf_type grpo \
--model path/to/Qwen3-8B/base \
--model_type qwen3 \
--use_vllm true \
--vllm_mode server \
--vllm_server_host $VLLM_NODE \
--vllm_server_port $VLLM_PORT \
--vllm_server_timeout 600 \
--vllm_server_pass_dataset true \
--vllm_enable_lora true \
--vllm_use_async_engine true \
--async_generate true \
--train_type lora \
--lora_rank 32 \
--lora_alpha 32 \
--torch_dtype bfloat16 \
--dataset path/to/train_swift.jsonl \
--load_from_cache_file true \
--split_dataset_ratio 0 \
--max_completion_length 2048 \
--num_train_epochs 2 \
--per_device_train_batch_size 2 \
--learning_rate 1e-6 \
--gradient_accumulation_steps 2 \
--steps_per_generation 2 \
--save_total_limit 2 \
--logging_steps 1 \
--warmup_ratio 0.05 \
--dataloader_num_workers 4 \
--dataset_num_proc 4 \
--num_generations 8 \
--temperature 1.0 \
--top_p 0.9 \
--top_k 50 \
--log_completions true \
--num_iterations 1 \
--beta 0 \
--loss_scale default \
--reward_model_type gym \
--external_plugins path/to/plugin.py \
--output_dir .swift-output \
--deepspeed zero3 \
--max_steps 1000 \
--report_to wandb
Log (Last Lines)
...
INFO: x.x.x.x:54402 - "POST /update_flattened_params/ HTTP/1.1" 200 OK
INFO: x.x.x.x:54402 - "POST /update_flattened_params/ HTTP/1.1" 200 OK
INFO: x.x.x.x:54402 - "POST /update_flattened_params/ HTTP/1.1" 200 OK
INFO: x.x.x.x:54402 - "POST /update_flattened_params/ HTTP/1.1" 200 OK
INFO: x.x.x.x:54402 - "POST /update_flattened_params/ HTTP/1.1" 200 OK
(EngineCore_DP0 pid=xxxxxx) INFO 10-31 19:26:29 [block_pool.py:292] Successfully reset prefix cache
INFO: x.x.x.x:54402 - "POST /reset_prefix_cache/ HTTP/1.1" 200 OK
(No further logs. Process hangs here)
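Since the freeze immediately follows a weight-sync round trip (update_flattened_params followed by reset_prefix_cache), it may be a stuck collective between trainer and server. Rerunning with standard NCCL/PyTorch debug variables should turn the silent hang into logs or a hard error (a sketch; these are generic debug settings, not a confirmed fix):

# Prepend to both the swift rollout and swift rlhf launches
export NCCL_DEBUG=INFO                    # log NCCL communicator setup and collectives
export NCCL_DEBUG_SUBSYS=INIT,COLL
export TORCH_NCCL_BLOCKING_WAIT=1         # make collective timeouts raise instead of hang
export TORCH_NCCL_ASYNC_ERROR_HANDLING=1  # abort the process group on NCCL errors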
Dependencies
requires-python = ">=3.11"
dependencies = [
"bitsandbytes>=0.48.1",
"hydra-core>=1.3.2",
"optimum[onnxruntime]>=1.17.1",
"peft>=0.17.1",
"pydantic>=2.12.3",
"trl[vllm]==0.23.0",
"vllm==0.10.2",
"deepspeed>=0.18.0",
"scikit-learn>=1.7.2",
"unsloth==2025.10.6",
"intel-extension-for-pytorch>=2.8.0",
"flash-attn",
"wandb>=0.22.2",
"fastapi[standard]>=0.119.1",
"flask>=3.1.2",
"ms-swift",
"verl>=0.6.0",
"math-verify>=0.8.0",
"verifiers[rl]>=0.1.6.post0",
]
[tool.uv.sources]
ms-swift = { git = "https://github.com/modelscope/ms-swift.git", rev = "b773b66" }