forked from foundation-model-stack/fastsafetensors
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrun_parallel.py
More file actions
32 lines (27 loc) · 1.3 KB
/
run_parallel.py
File metadata and controls
32 lines (27 loc) · 1.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# !/usr/bin/env python3
# PIDS=()
# runner="python -m paddle.distributed.launch"
# cd paddle_case
# ${runner} --nnodes=2 --master=127.0.0.1:12345 --rank=0 run_parallel.py &
# PIDS+=($!)
# ${runner} --nnodes=2 --master=127.0.0.1:12345 --rank=1 run_parallel.py &
# PIDS+=($!)
# wait "${PIDS[@]}"
import paddle
import paddle.distributed as dist
from fastsafetensors import SafeTensorsFileLoader
dist.init_parallel_env()
backend = "nccl" if paddle.device.cuda.device_count() else "gloo"
pg = dist.new_group(ranks=[0,1], backend=backend)
device = "gpu" if paddle.device.cuda.device_count() else "cpu"
loader = SafeTensorsFileLoader(pg, device, nogds=False, debug_log=True, framework="paddle")
loader.add_filenames({0: ["a_paddle.safetensors"], 1:["b_paddle.safetensors"]}) # {rank: files}
# load a.safetensors to rank 0 GPU and b.safetensors to rank 1 GPU
fb = loader.copy_files_to_device()
# every rank must call get_tensor and get_sharded in the same order since they internally call paddle.distributed collective ops
tensor_a0 = fb.get_tensor(tensor_name="a0") # broadcast
tensor_b0_sharded = fb.get_sharded(tensor_name="b0", dim=1) # partition and scatter
print(f"RANK {pg.process_group.rank()}: tensor_a0={tensor_a0}")
print(f"RANK {pg.process_group.rank()}: tensor_b0_sharded={tensor_b0_sharded}")
fb.close()
loader.close()