Skip to content

Commit 01934d2

Browse files
authored
Merge pull request #3 from atasoglu/develop
fix: assign sequential IDs and write records in order during parallel…
2 parents 964b8c9 + 210aedc commit 01934d2

File tree

1 file changed

+8
-4
lines changed

1 file changed

+8
-4
lines changed

src/toolsgen/core/parallel.py

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -133,6 +133,7 @@ def generate_records_parallel(
133133

134134
results_by_index: Dict[int, Record] = {}
135135
failed = 0
136+
next_id_to_write = 0
136137

137138
ctx = mp.get_context("spawn")
138139
with ProcessPoolExecutor(
@@ -156,6 +157,13 @@ def generate_records_parallel(
156157
if sample_result.record:
157158
record = Record.model_validate(sample_result.record)
158159
results_by_index[sample_result.sample_index] = record
160+
161+
while next_id_to_write in results_by_index:
162+
rec = results_by_index[next_id_to_write]
163+
rec.id = f"record_{next_id_to_write:06d}"
164+
append_record_jsonl(rec, jsonl_path)
165+
del results_by_index[next_id_to_write]
166+
next_id_to_write += 1
159167
else:
160168
tqdm.write(
161169
"Warning: Failed to generate sample "
@@ -170,8 +178,4 @@ def generate_records_parallel(
170178
pbar.update(1)
171179

172180
all_records = [results_by_index[i] for i in sorted(results_by_index.keys())]
173-
for idx, record in enumerate(all_records):
174-
record.id = f"record_{idx:06d}"
175-
append_record_jsonl(record, jsonl_path)
176-
177181
return all_records, failed

0 commit comments

Comments
 (0)