
Commit 6ef7687

support for paddle (#16)
- support for paddle
- support gloo in cpu case
- try nccl in gpu case
- update test case
- fix the bug in nccl
- support gds
- fix merge bug
- add paddle unittest
- confuse about test_memmove
- fix uint16
- add the paddlepaddle-gpu to project.toml
- paddlepaddle set 3.0 version
- paddlepaddle-gpu==3.0.0 should install from www.paddlepaddle.org.cn
- remove cpp.cpythonxxx.so
- add paddle_loaded

Signed-off-by: Takeshi Yoshimura <tyos@jp.ibm.com>
Signed-off-by: zeroRains <linjunlu@zerorains.top>
1 parent bfbcc73 commit 6ef7687

21 files changed: +550 −147 lines
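Before the per-file diffs, here is a minimal end-to-end sketch of the Paddle path this commit adds, condensed from the examples/paddle_case/gen.py and run_single.py examples below (single process, no sharding; nogds=True is used here on the assumption that GPUDirect Storage is not set up):

import paddle
from safetensors.paddle import save_file
from fastsafetensors import SafeTensorsFileLoader, SingleGroup

# write one 16x8 float16 tensor to a safetensors file, as gen.py does
t0 = paddle.concat([paddle.full((1, 8), i, dtype=paddle.float16) for i in range(16)], axis=0)
save_file({"a0": t0}, "a_paddle.safetensors", metadata={"fst": "sample"})

# load it back through fastsafetensors with the new framework="paddle" option, as run_single.py does
device = "gpu:0" if paddle.is_compiled_with_cuda() else "cpu"
loader = SafeTensorsFileLoader(SingleGroup(), device, nogds=True, debug_log=True, framework="paddle")
loader.add_filenames({0: ["a_paddle.safetensors"]})  # {rank: files}
fb = loader.copy_files_to_device()
print(fb.get_tensor(tensor_name="a0"))
fb.close()
loader.close()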

.github/workflows/test-paddle.yaml

Lines changed: 1 addition & 1 deletion
@@ -55,4 +55,4 @@ jobs:
         uses: actions/upload-artifact@v4
         with:
           name: pytest-log-paddle-${{ matrix.python-version }}
-          path: /tmp/pytest-log
+          path: /tmp/pytest-log

.gitignore

Lines changed: 5 additions & 1 deletion
@@ -5,4 +5,8 @@ dist/
 htmlcov/
 .coverage
 .coverage_*
-.pytest_cache/
+.pytest_cache/
+.vscode
+*.log
+*.pyc
+examples/paddle_case/log
Two binary files changed (360 Bytes each): binary files not shown.

examples/paddle_case/gen.py

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
import os
import paddle
t0 = paddle.concat([paddle.full((1, 8), i, dtype=paddle.float16) for i in range(0, 16)], axis=0)
from safetensors.paddle import save_file
for file_prefix in ["a", "b"]:
    save_file({f"{file_prefix}0": t0}, f"{file_prefix}_paddle.safetensors", metadata={"fst": "sample"})
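As a quick sanity check (not part of this commit), the files gen.py writes can be inspected with safetensors' safe_open; each should contain a single 16x8 float16 tensor plus the {"fst": "sample"} metadata. This assumes only the safetensors package already imported above:

from safetensors import safe_open

for prefix in ["a", "b"]:
    with safe_open(f"{prefix}_paddle.safetensors", framework="numpy") as f:
        print(f"{prefix}_paddle.safetensors metadata:", f.metadata())
        for name in f.keys():
            t = f.get_tensor(name)
            print(f"  {name}: shape={t.shape} dtype={t.dtype}")  # expect (16, 8) float16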
examples/paddle_case/run_parallel.py

Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
# !/usr/bin/env python3
# PIDS=()

# runner="python -m paddle.distributed.launch"

# cd paddle_case
# ${runner} --nnodes=2 --master=127.0.0.1:12345 --rank=0 run_parallel.py &
# PIDS+=($!)
# ${runner} --nnodes=2 --master=127.0.0.1:12345 --rank=1 run_parallel.py &
# PIDS+=($!)
# wait "${PIDS[@]}"

import paddle
import paddle.distributed as dist
from fastsafetensors import SafeTensorsFileLoader
dist.init_parallel_env()
backend = "nccl" if paddle.is_compiled_with_cuda() else "gloo"
pg = dist.new_group(ranks=[0, 1], backend=backend)
device = "gpu:0" if paddle.is_compiled_with_cuda() else "cpu"
loader = SafeTensorsFileLoader(pg, device, nogds=False, debug_log=True, framework="paddle")
loader.add_filenames({0: ["a_paddle.safetensors"], 1: ["b_paddle.safetensors"]})  # {rank: files}

# load a_paddle.safetensors to rank 0 GPU and b_paddle.safetensors to rank 1 GPU
fb = loader.copy_files_to_device()

# every rank must call get_tensor and get_sharded in the same order since they internally call paddle.distributed collective ops
tensor_a0 = fb.get_tensor(tensor_name="a0")  # broadcast
tensor_b0_sharded = fb.get_sharded(tensor_name="b0", dim=1)  # partition and scatter
print(f"RANK {pg.process_group.rank()}: tensor_a0={tensor_a0}")
print(f"RANK {pg.process_group.rank()}: tensor_b0_sharded={tensor_b0_sharded}")
fb.close()
loader.close()
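For intuition about get_sharded(tensor_name="b0", dim=1) above: b0 is the 16x8 float16 tensor written by gen.py, so with two ranks each rank receives a 16x4 shard. A standalone illustration of that partition (this is not fastsafetensors internals, just the equivalent split):

import paddle

# rebuild b0 exactly as gen.py does: sixteen rows of shape (1, 8)
b0 = paddle.concat([paddle.full((1, 8), i, dtype=paddle.float16) for i in range(16)], axis=0)
shards = paddle.split(b0, num_or_sections=2, axis=1)  # split the columns across 2 ranks
print([tuple(s.shape) for s in shards])  # expect [(16, 4), (16, 4)]; rank r would hold shards[r]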

examples/paddle_case/run_single.py

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
import paddle
from fastsafetensors import SafeTensorsFileLoader, SingleGroup
device = "gpu:0" if paddle.is_compiled_with_cuda() else "cpu"
loader = SafeTensorsFileLoader(SingleGroup(), device, nogds=False, debug_log=True, framework="paddle")
loader.add_filenames({0: ["a_paddle.safetensors", "b_paddle.safetensors"]})  # {rank: files}
fb = loader.copy_files_to_device()
tensor_a0 = fb.get_tensor(tensor_name="a0")
tensor_b0 = fb.get_tensor(tensor_name="b0")
print(f"a0: {tensor_a0}")
mycat = paddle.concat([tensor_a0, tensor_b0], axis=1)
print(f"cat: {mycat}, size={mycat.size}")
fb.close()
loader.close()
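Given the files gen.py writes, tensor_a0 and tensor_b0 here are each 16x8 float16, so mycat is 16x16 and mycat.size prints 256.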

examples/run_paddle_parrallel.sh

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
#!/usr/bin/env bash
PIDS=()

runner="python -m paddle.distributed.launch"
# runner="torchrun"

cd paddle_case
rm -rf log
# one machine, multiple GPUs (case: 1 machine, 2 GPUs)
# differs from the torch script because paddle distributed uses nccl to communicate between GPUs
CUDA_VISIBLE_DEVICES=0 ${runner} --nnodes=2 --master=127.0.0.1:8800 --rank=0 run_parallel.py &
PIDS+=($!)
CUDA_VISIBLE_DEVICES=1 ${runner} --nnodes=2 --master=127.0.0.1:8800 --rank=1 run_parallel.py &
PIDS+=($!)
wait "${PIDS[@]}"

examples/run_parallel.py

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@
 dist.barrier()
 pg = dist.group.WORLD
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-loader = SafeTensorsFileLoader(pg, device, nogds=True, debug_log=True)
+loader = SafeTensorsFileLoader(pg, device, nogds=False, debug_log=True)
 loader.add_filenames({0: ["a.safetensors"], 1:["b.safetensors"]}) # {rank: files}

 # load a.safetensors to rank 0 GPU and b.safetensors to rank 1 GPU

examples/run_reuse_loader.py

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@
 sys.path.insert(0, "/nvme/manish/repos/fastsafetensors/fastsafetensors")

 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-loader = SafeTensorsFileLoader(SingleGroup(), device)#, nogds=True, debug_log=True)
+loader = SafeTensorsFileLoader(SingleGroup(), device, nogds=True, debug_log=True)

 loader.add_filenames({0: ["a.safetensors"]}) # {rank: files}
 fb = loader.copy_files_to_device()

0 commit comments