Skip to content

Commit 20fb78d

Browse files
committed
Fix bugs in file partitioning when running in collective mode
1 parent 8c7d113 commit 20fb78d

File tree

2 files changed

+32
-1
lines changed

2 files changed

+32
-1
lines changed

core/engine/local_cluster.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,8 @@ def start_procs(self):
119119
"PADDLE_TRAINERS_NUM": str(worker_num),
120120
"TRAINING_ROLE": "TRAINER",
121121
"PADDLE_TRAINER_ID": str(i),
122-
"FLAGS_selected_gpus": str(selected_gpus[i])
122+
"FLAGS_selected_gpus": str(selected_gpus[i]),
123+
"PADDLEREC_GPU_NUMS": str(selected_gpus_num)
123124
})
124125

125126
os.system("mkdir -p {}".format(logs_dir))

core/utils/dataloader_instance.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,16 @@ def dataloader_by_name(readerclass,
4747

4848
files.sort()
4949

50+
# for local cluster: discard some files if files cannot be divided equally between GPUs
51+
if (context["device"] == "GPU"):
52+
selected_gpu_nums = int(os.getenv("PADDLEREC_GPU_NUMS"))
53+
discard_file_nums = len(files) % selected_gpu_nums
54+
if (discard_file_nums != 0):
55+
print(
56+
"Warning: beacause files cannot be divided equally between GPUs,discard these files:{}".
57+
format(files[-discard_file_nums:]))
58+
files = files[:len(files) - discard_file_nums]
59+
5060
need_split_files = False
5161
if context["engine"] == EngineMode.LOCAL_CLUSTER:
5262
# for local cluster: split files for multi process
@@ -109,6 +119,16 @@ def slotdataloader_by_name(readerclass, dataset_name, yaml_file, context):
109119

110120
files.sort()
111121

122+
# for local cluster: discard some files if files cannot be divided equally between GPUs
123+
if (context["device"] == "GPU"):
124+
selected_gpu_nums = int(os.getenv("PADDLEREC_GPU_NUMS"))
125+
discard_file_nums = len(files) % selected_gpu_nums
126+
if (discard_file_nums != 0):
127+
print(
128+
"Warning: beacause files cannot be divided equally between GPUs, discard these files:{}".
129+
format(files[-discard_file_nums:]))
130+
files = files[:len(files) - discard_file_nums]
131+
112132
need_split_files = False
113133
if context["engine"] == EngineMode.LOCAL_CLUSTER:
114134
# for local cluster: split files for multi process
@@ -179,6 +199,16 @@ def slotdataloader(readerclass, train, yaml_file, context):
179199

180200
files.sort()
181201

202+
# for local cluster: discard some files if files cannot be divided equally between GPUs
203+
if (context["device"] == "GPU"):
204+
selected_gpu_nums = int(os.getenv("PADDLEREC_GPU_NUMS"))
205+
discard_file_nums = len(files) % selected_gpu_nums
206+
if (discard_file_nums != 0):
207+
print(
208+
"Warning: beacause files cannot be divided equally between GPUs,discard these files:{}".
209+
format(files[-discard_file_nums:]))
210+
files = files[:len(files) - discard_file_nums]
211+
182212
need_split_files = False
183213
if context["engine"] == EngineMode.LOCAL_CLUSTER:
184214
# for local cluster: split files for multi process

0 commit comments

Comments
 (0)