# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
| 3 | +""" |
| 4 | +experimental support for data-parallel inference with torchrun |
| 5 | +Note the data load balancing and distribution is done out of the vllm engine, |
| 6 | +no internal lb supported in external_launcher mode. |
| 7 | +""" |

from vllm import LLM, SamplingParams

# Create prompts, the same across all ranks
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
] * 50

# Create sampling parameters, the same across all ranks
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Use `distributed_executor_backend="external_launcher"` so that
# this LLM engine/instance creates only one worker per torchrun process.
# It is important to set an explicit seed so that all ranks share the same
# random seed and sampling is deterministic across ranks.
llm = LLM(
    # Any MoE chat model works here; a local path may be used instead.
    model="Qwen/Qwen1.5-MoE-A2.7B-Chat",
    tensor_parallel_size=2,
    data_parallel_size=4,
    pipeline_parallel_size=1,
    enable_expert_parallel=True,
    distributed_executor_backend="external_launcher",
    max_model_len=32768,
    # FIXME: with torch.compile, the torchrun processes do not exit properly
    enforce_eager=True,
    seed=1,
)

dp_rank = llm.llm_engine.vllm_config.parallel_config.data_parallel_rank
dp_size = llm.llm_engine.vllm_config.parallel_config.data_parallel_size

# Each DP rank keeps only its own shard of the prompts (round-robin by index).
prompts = [
    f"{idx}.{prompt}" for idx, prompt in enumerate(prompts) if idx % dp_size == dp_rank
]

outputs = llm.generate(prompts, sampling_params)


# Each DP rank prints the outputs for its own shard of the prompts;
# ranks within the same tensor-parallel group produce identical outputs.
| 51 | +print("-" * 50) |
| 52 | +for output in outputs: |
| 53 | + prompt = output.prompt |
| 54 | + generated_text = output.outputs[0].text |
| 55 | + print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}\n") |
| 56 | + print("-" * 50) |
| 57 | + |
| 58 | +""" |
| 59 | +Further tips: |
| 60 | +
|
1. To communicate control messages across all ranks, use the CPU group,
a PyTorch ProcessGroup with the GLOO backend.

```python
import torch.distributed as dist

from vllm.distributed.parallel_state import get_world_group

cpu_group = get_world_group().cpu_group
torch_rank = dist.get_rank(group=cpu_group)
if torch_rank == 0:
    # do something for rank 0, e.g. saving the results to disk.
    ...
```
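
For example, a minimal sketch (assuming the `outputs` variable produced by the
script above) that collects every rank's generated texts over the CPU group;
note that ranks within the same tensor-parallel group contribute duplicates:

```python
import torch.distributed as dist

from vllm.distributed.parallel_state import get_world_group

cpu_group = get_world_group().cpu_group
texts = [o.outputs[0].text for o in outputs]
gathered = [None] * dist.get_world_size(group=cpu_group)
# all_gather_object works on the GLOO-backed CPU group
dist.all_gather_object(gathered, texts, group=cpu_group)
if dist.get_rank(group=cpu_group) == 0:
    # rank 0 now sees every rank's texts and could, e.g., write them to disk
    all_texts = [t for rank_texts in gathered for t in rank_texts]
```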

2. To communicate data across all ranks, use the model's device group,
a PyTorch ProcessGroup with the NCCL backend.
```python
from vllm.distributed.parallel_state import get_world_group

device_group = get_world_group().device_group
```
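
For example, a minimal sketch (again assuming `outputs` from the script above)
that sums the number of generated tokens across all ranks with an NCCL
all-reduce; the tensor has to live on this rank's GPU:

```python
import torch
import torch.distributed as dist

from vllm.distributed.parallel_state import get_world_group

device_group = get_world_group().device_group
# count the tokens generated on this rank and place the counter on the GPU
num_tokens = torch.tensor(
    [sum(len(o.outputs[0].token_ids) for o in outputs)], device="cuda"
)
dist.all_reduce(num_tokens, op=dist.ReduceOp.SUM, group=device_group)
```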

3. To access the model directly on every rank, use the following code:
```python
llm.llm_engine.model_executor.driver_worker.worker.model_runner.model
```
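
For instance, a small sketch that inspects the model object loaded on the
current rank (a regular torch.nn.Module):

```python
model = llm.llm_engine.model_executor.driver_worker.worker.model_runner.model
# print the model class plus the dtype and device of its first parameter
first_param = next(model.parameters())
print(type(model).__name__, first_param.dtype, first_param.device)
```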
| 83 | +""" |