Skip to content

Commit 18391ca

Browse files
zeroRains and zeroRains authored
fix the distributed load for paddle (#19)
Fix the distributed load for paddle; remove useless file; make sure the device id does not exceed the device count. Signed-off-by: zeroRains <linjunlu@zerorains.top> Co-authored-by: zeroRains <zerorainssakurar@qq.com>
1 parent 6ef7687 commit 18391ca

File tree

11 files changed

+115
-25
lines changed

11 files changed

+115
-25
lines changed

.github/workflows/test-paddle.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,8 @@ jobs:
4747
COVERAGE_FILE=.coverage_1 CUDA_VISIBLE_DEVICES="" pytest -s --cov=${LIBDIR} test_fastsafetensors.py > /tmp/pytest-log/1.log 2>&1
4848
COVERAGE_FILE=.coverage_2 torchrun --nnodes=2 --master_addr=0.0.0.0 --master_port=1234 --node_rank=0 --no-python pytest -s --cov=${LIBDIR} test_multi.py > /tmp/pytest-log/2.log 2>&1 &
4949
COVERAGE_FILE=.coverage_3 torchrun --nnodes=2 --master_addr=0.0.0.0 --master_port=1234 --node_rank=1 --no-python pytest -s --cov=${LIBDIR} test_multi.py > /tmp/pytest-log/3.log 2>&1
50-
coverage combine .coverage_0 .coverage_1 .coverage_2 .coverage_3
50+
python -m paddle.distributed.launch --nproc_per_node 2 run_distributed_paddle_test.py -s --cov=${LIBDIR} test_multi_paddle.py
51+
coverage combine .coverage_0 .coverage_1 .coverage_2 .coverage_3 .coverage_4 .coverage_5
5152
coverage html
5253
mv htmlcov /tmp/pytest-log
5354
- name: upload pytest log

examples/paddle_case/run_parallel.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,9 @@
1414
import paddle.distributed as dist
1515
from fastsafetensors import SafeTensorsFileLoader
1616
dist.init_parallel_env()
17-
backend = "nccl" if paddle.is_compiled_with_cuda() else "gloo"
17+
backend = "nccl" if paddle.device.cuda.device_count() else "gloo"
1818
pg = dist.new_group(ranks=[0,1], backend=backend)
19-
device = "gpu:0" if paddle.is_compiled_with_cuda() else "cpu"
19+
device = "gpu" if paddle.device.cuda.device_count() else "cpu"
2020
loader = SafeTensorsFileLoader(pg, device, nogds=False, debug_log=True, framework="paddle")
2121
loader.add_filenames({0: ["a_paddle.safetensors"], 1:["b_paddle.safetensors"]}) # {rank: files}
2222

examples/paddle_case/run_single.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
import paddle
22
from fastsafetensors import SafeTensorsFileLoader, SingleGroup
3-
device = "gpu:0" if paddle.is_compiled_with_cuda() else "cpu"
3+
device = "gpu:0" if paddle.device.cuda.device_count() else "cpu"
44
loader = SafeTensorsFileLoader(SingleGroup(), device, nogds=False, debug_log=True, framework="paddle")
55
loader.add_filenames({0: ["a_paddle.safetensors", "b_paddle.safetensors"]}) # {rank: files}
66
fb = loader.copy_files_to_device()
77
tensor_a0 = fb.get_tensor(tensor_name="a0")
88
tensor_b0 = fb.get_tensor(tensor_name="b0")
9-
print(f"a0: {tensor_a0}")
10-
mycat = paddle.concat([tensor_a0, tensor_b0], axis=1)
9+
print(f"a0: {tensor_a0}\n b0 : {tensor_b0}")
10+
mycat = paddle.concat([tensor_a0, tensor_b0])
1111
print(f"cat: {mycat}, size={mycat.size}")
1212
fb.close()
1313
loader.close()
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# !/usr/bin/env python3
2+
PIDS=()
3+
4+
runner="python -m paddle.distributed.launch"
5+
6+
cd paddle_case
7+
rm -rf log
8+
# It can only be used on the CPU version of paddlepaddle
9+
${runner} --nproc_per_node 2 run_parallel.py
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# !/usr/bin/env python3
2+
PIDS=()
3+
4+
runner="python -m paddle.distributed.launch"
5+
6+
cd paddle_case
7+
rm -rf log
8+
# It can only be used on the GPU version of paddlepaddle-gpu
9+
# Single machine with multiple GPUs (case: 1 machine, 2 GPUs)
10+
# Differs from the torch script because paddle distributed uses NCCL to communicate between GPUs
11+
CUDA_VISIBLE_DEVICES=0,1 ${runner} --gpus 0,1 run_parallel.py

examples/run_paddle_parrallel.sh

Lines changed: 0 additions & 15 deletions
This file was deleted.

fastsafetensors/loader.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -71,8 +71,17 @@ def __init__(self, pg: dist.ProcessGroup, device: torch.device, bbuf_size_kb: in
7171
if device == "cpu":
7272
d_id = None
7373
else:
74-
d_id = device.split(":") # "gpu:0" or "gpu"
75-
d_id = int(d_id[1]) if len(d_id) == 2 else 0
74+
if isinstance(self.pg, SingleGroup):
75+
# For single (gpu:x, gpu)
76+
# gpu:x, like gpu:0, gpu:1, ...
77+
d_id = device.split(":")
78+
d_id = int(d_id[1]) if len(d_id) == 2 else 0
79+
else:
80+
# For distributed
81+
# The current rank determines which gpu is used
82+
# rank0 use gpu:0, rank1 use gpu:1
83+
d_id = self.pg.rank() % paddle.device.cuda.device_count()
84+
self.device = f"gpu:{d_id}"
7685
node = get_device_numa_node(d_id)
7786
if node is not None:
7887
fstcpp.set_numa_node(node)
@@ -140,7 +149,7 @@ def copy_files_to_device(self, dtype: torch.dtype=None, use_buf_register: bool=T
140149
if self.device.type != "cpu":
141150
torch.cuda.set_device(self.device)
142151
elif paddle_loaded and self.framework == "paddle":
143-
if self.device != paddle.CPUPlace():
152+
if "gpu" in self.device:
144153
paddle.set_device(self.device)
145154

146155
need_wait: List[LazyTensorFactory] = []

tests/conftest.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import torch.distributed as dist
55
from fastsafetensors import cpp as fstcpp
66
from fastsafetensors import SingleGroup
7+
from fastsafetensors.common import paddle_loaded
78
from typing import List
89

910
TESTS_DIR = os.path.dirname(__file__)
@@ -39,6 +40,24 @@ def pg():
3940
PG = SingleGroup()
4041
return PG
4142

43+
@pytest.fixture(scope='session', autouse=True)
44+
def pg_paddle():
45+
PG = SingleGroup()
46+
47+
if paddle_loaded:
48+
# The following code can only be successfully
49+
# executed by running the code using
50+
# `python -m paddle.distributed.launch`
51+
try:
52+
import paddle
53+
import paddle.distributed as dist
54+
dist.init_parallel_env()
55+
backend = "nccl" if paddle.device.cuda.device_count() else "gloo"
56+
PG = dist.new_group(ranks=[0,1], backend=backend)
57+
except:
58+
pass
59+
return PG
60+
4261
@pytest.fixture(scope='session', autouse=True)
4362
def dev_init() -> None:
4463
if torch.cuda.is_available():
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
import pytest
2+
import sys
3+
import os
4+
5+
if __name__ == "__main__":
6+
# There are 4 commands before this test
7+
# GPU distributed testing needs at least 2 GPUs
8+
rank = int(os.getenv("PADDLE_TRAINER_ID")) + 4
9+
os.environ["COVERAGE_FILE"] = f".coverage_{rank}"
10+
pytest_args = sys.argv[1:]
11+
sys.exit(pytest.main(pytest_args))

tests/test_fastsafetensors.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,7 @@ def test_SafeTensorsFileLoader(fstcpp_log, input_files, framework="pytorch"):
228228
if framework == "pytorch":
229229
data_type = torch.float16
230230
elif framework == "paddle":
231-
# There are some lack of accuracy in paddle.float16 (about 1e-4)
231+
# paddle.float16 loses some accuracy (about 1e-4) on CPU.
232232
data_type = paddle.float32
233233
else:
234234
raise NotImplementedError(f"Do not support the framework: {framework}")

0 commit comments

Comments
 (0)