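"""Test MnnvlMemory with a custom communication backend.

Spawns one process per GPU; the ranks form a NCCL group and exercise MNNVL
fabric allocations through a user-provided CommBackend implementation.
"""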
import multiprocessing as mp
import socket
from typing import Any

import pytest
import torch
import torch.distributed as dist

import pynvml

from flashinfer.comm.mapping import Mapping
from flashinfer.comm.mnnvl import CommBackend, MnnvlConfig, MnnvlMemory

pynvml.nvmlInit()


class CustomCommunicator(CommBackend):
    """Minimal CommBackend implementation backed by a torch.distributed group."""

    def __init__(self, group):
        self._group = group

    def Get_rank(self) -> int:
        return dist.get_rank(self._group)

    def Get_size(self) -> int:
        return dist.get_world_size(self._group)

    def allgather(self, data: int | bytes):
        world_size = self.Get_size()
        if isinstance(data, int):
            device = f"cuda:{torch.cuda.current_device()}"
            local_tensor = torch.tensor([data], device=device, dtype=torch.int32)
            gathered = [torch.zeros_like(local_tensor) for _ in range(world_size)]
            dist.all_gather(gathered, local_tensor, group=self._group)
            return [int(x.item()) for x in gathered]
        elif isinstance(data, bytes):
            # all_gather_object fills the pre-sized list with each rank's bytes.
            gathered = [None] * world_size
            dist.all_gather_object(gathered, data, group=self._group)
            return gathered
        else:
            raise TypeError(f"Unsupported type for allgather: {type(data)}")

    def Split(self, color: int, key: int) -> "CustomCommunicator":
        # No sub-communicators are needed for this test; reuse the same group.
        return self


def get_open_port() -> int:
    # Ask the OS for a free port by binding to port 0; fall back to IPv6 loopback.
    try:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.bind(("127.0.0.1", 0))
            return s.getsockname()[1]
    except OSError:
        with socket.socket(socket.AF_INET6, socket.SOCK_STREAM) as s:
            s.bind(("::1", 0))
            return s.getsockname()[1]


def multi_process_parallel(
    world_size: int, dtype: torch.dtype, test_target: Any, target_args: tuple = ()
) -> None:
    # Spawn one worker process per rank and fail if any worker exits non-zero.
    mp.set_start_method("spawn", force=True)

    procs = []
    distributed_init_port = get_open_port()
    for i in range(world_size):
        proc_args = (world_size, i, dtype, distributed_init_port) + target_args
        proc = mp.Process(target=test_target, args=proc_args, name=f"Worker-{i}")
        proc.start()
        procs.append(proc)

    for i in range(world_size):
        procs[i].join()
        assert procs[i].exitcode == 0, (
            f"Process {i} failed with exit code {procs[i].exitcode}"
        )


def align_memory(size: int) -> int:
    # Round size up to a 2 MiB boundary (the allocator's expected granularity).
    align_size = 2 * 1024 * 1024
    return (size + align_size - 1) // align_size * align_size
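

# Per-rank worker: initializes NCCL and MnnvlMemory, then performs three
# allocations, verifying offsets and cross-rank visibility after each.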
def _init_mnnvl_memory(world_size, rank, dtype, distributed_init_port):
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
    distributed_init_method = f"tcp://localhost:{distributed_init_port}"
    dist.init_process_group(
        backend="nccl",
        init_method=distributed_init_method,
        rank=rank,
        world_size=world_size,
    )
    group = dist.group.WORLD

    MnnvlMemory.initialize()
    mapping = Mapping(world_size, rank, world_size, tp_size=world_size)

    # First allocation: a bit under 4 MiB, so it should round up to 4 MiB.
    allocate0_size = 4 * 1024 * 1024 - 3 * 1024
    mnnvl_config = MnnvlConfig(
        comm_backend=CustomCommunicator(group),
        fabric_page_size=1 << 29,  # 512 MiB
        allocation_granularity=0,  # Auto-detect
    )
    MnnvlMemory.set_comm_from_config(mapping, mnnvl_config)
    mnnvl_memory0 = MnnvlMemory(mapping, allocate0_size)
    allocate0_size_aligned = align_memory(allocate0_size)

    assert MnnvlMemory.current_mem_offset == allocate0_size_aligned
    tensor0 = mnnvl_memory0.as_torch_strided_tensor(torch.int32)
    numel_per_rank = allocate0_size // 4
    # Each rank writes its neighbor's slot, then every rank verifies all slots.
    tensor0[(rank + 1) % world_size] = torch.arange(
        start=rank, end=rank + numel_per_rank, dtype=torch.int32, device="cuda"
    )
    dist.barrier(group=group)
    for r in range(world_size):
        assert torch.equal(
            tensor0[(r + 1) % world_size],
            torch.arange(
                start=r, end=r + numel_per_rank, dtype=torch.int32, device="cuda"
            ),
        )
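
    # Second allocation: the offset should stack on top of the first.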
    allocate1_size = 30 * 1024 * 1024 - 2 * 1024
    mnnvl_memory1 = MnnvlMemory(mapping, allocate1_size)
    allocate1_size_aligned = align_memory(allocate1_size)
    assert (
        MnnvlMemory.current_mem_offset
        == allocate0_size_aligned + allocate1_size_aligned
    )
    tensor1 = mnnvl_memory1.as_torch_strided_tensor(torch.float32)
    numel_per_rank = allocate1_size // 4
    tensor1[(rank + 5) % world_size] = torch.arange(
        start=rank,
        end=rank + numel_per_rank,
        dtype=torch.float32,
        device="cuda",
    )
    dist.barrier(group=group)
    for r in range(world_size):
        assert torch.equal(
            tensor1[(r + 5) % world_size],
            torch.arange(
                start=r, end=r + numel_per_rank, dtype=torch.float32, device="cuda"
            ),
        )
    dist.barrier(group=group)
    # Drop the first allocation so its memory can be reclaimed.
    del tensor0, mnnvl_memory0
    dist.barrier(group=group)
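
    # Third allocation: 768 MiB exceeds the configured 512 MiB fabric page.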
    large_allocation2_size = 768 * 1024 * 1024
    large_mnnvl_memory2 = MnnvlMemory(mapping, large_allocation2_size)
    allocate2_size_aligned = align_memory(large_allocation2_size)
    assert MnnvlMemory.current_mem_offset == allocate2_size_aligned
    assert large_mnnvl_memory2.rank_stride == (1 << 30)

    del tensor1
    dist.destroy_process_group()


@pytest.mark.skipif(
    not MnnvlMemory.supports_mnnvl(),
    reason="MNNVL memory is not supported on this platform",
)
@pytest.mark.parametrize("world_size", [2, 4])
def test_mnnvl_custom_communicator(world_size):
    dtype = torch.float16
    available_gpus = torch.cuda.device_count()
    if world_size > available_gpus:
        pytest.skip(
            f"world_size {world_size} is greater than available_gpus {available_gpus}"
        )
    print(f"Running test for world_size={world_size}")

    multi_process_parallel(
        world_size,
        dtype,
        _init_mnnvl_memory,
        target_args=(),
    )
    print(f"custom mnnvl communicator world_size = {world_size}: OK")
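

if __name__ == "__main__":
    # Convenience entry point: run a single 2-GPU configuration without pytest.
    test_mnnvl_custom_communicator(2)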