
Commit b7febe8

added regenerated data processing for llama series (#396)
1 parent 866ca44 · commit b7febe8

File tree: 1 file changed (+42 −8 lines)


scripts/prepare_data.py

Lines changed: 42 additions & 8 deletions
@@ -39,6 +39,10 @@ def parse_args():
             "sharegpt",
             "eaglechat",
             "perfectblend",
+            "perfectblend-llama3.1-8b-instruct",
+            "perfectblend-llama3.3-70b-instruct",
+            "perfectblend-llama4-scout-instruct",
+            "perfectblend-llama4-maverick-instruct",
             "magpie-qwen2.5-pro-1m-v0.1",
             "sharegpt4v",
             "allava4v",
@@ -189,20 +193,26 @@ def process_and_save_ds(train_ds, test_ds, output_path, proc_fn, dataset_name):
     total_skipped_count = 0
     with open(train_output_jsonl_path, "w") as f:
         for item in tqdm(train_ds, desc=f"Processing {dataset_name} dataset"):
-            row, skipped_count = proc_fn(item)
-            if row is None:
-                continue
-            total_skipped_count += skipped_count
+            if proc_fn is not None:
+                row, skipped_count = proc_fn(item)
+                if row is None:
+                    continue
+                total_skipped_count += skipped_count
+            else:
+                row = item
             f.write(json.dumps(row, ensure_ascii=False) + "\n")

     if test_ds is not None:
         test_output_jsonl_path = output_path.joinpath(f"{dataset_name}_test.jsonl")
         with open(test_output_jsonl_path, "w") as f:
             for item in tqdm(test_ds, desc=f"Processing {dataset_name} test dataset"):
-                row, skipped_count = proc_fn(item)
-                if row is None:
-                    continue
-                total_skipped_count += skipped_count
+                if proc_fn is not None:
+                    row, skipped_count = proc_fn(item)
+                    if row is None:
+                        continue
+                    total_skipped_count += skipped_count
+                else:
+                    row = item
                 f.write(json.dumps(row, ensure_ascii=False) + "\n")

     if total_skipped_count > 0:
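
The point of this hunk is that the row processor is now optional: when proc_fn is None, items are written to the JSONL file verbatim instead of being converted and possibly skipped. A self-contained sketch of the same loop, using a plain list in place of a Hugging Face dataset and a hypothetical write_jsonl wrapper (names not from the repo):

import json
from tqdm import tqdm

def write_jsonl(items, path, proc_fn=None, dataset_name="demo"):
    total_skipped_count = 0
    with open(path, "w") as f:
        for item in tqdm(items, desc=f"Processing {dataset_name} dataset"):
            if proc_fn is not None:
                # the processor may reject the row (None) and reports a skipped count
                row, skipped_count = proc_fn(item)
                if row is None:
                    continue
                total_skipped_count += skipped_count
            else:
                # regenerated datasets are assumed to already be in the target schema
                row = item
            f.write(json.dumps(row, ensure_ascii=False) + "\n")
    return total_skipped_count

# proc_fn=None exercises the new passthrough branch
write_jsonl([{"id": 0, "conversations": []}], "demo_train.jsonl")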
@@ -252,6 +262,30 @@ def main():
         ds = load_dataset("mlabonne/open-perfectblend")["train"]
         ds = ds.map(add_index, with_indices=True)
         proc_fn = process_sharegpt_row
+    elif args.dataset == "perfectblend-llama3.1-8b-instruct":
+        ds = load_dataset("frankleeeee/PerfectBlend-Regenerated-Llama-3.1-8B-Instruct")[
+            "train"
+        ]
+        ds = ds.map(add_index, with_indices=True)
+        proc_fn = None
+    elif args.dataset == "perfectblend-llama3.3-70b-instruct":
+        ds = load_dataset(
+            "frankleeeee/PerfectBlend-Regenerated-Llama-3.3-70B-Instruct"
+        )["train"]
+        ds = ds.map(add_index, with_indices=True)
+        proc_fn = None
+    elif args.dataset == "perfectblend-llama4-scout-instruct":
+        ds = load_dataset(
+            "frankleeeee/PerfectBlend-Regenerated-Llama-4-Scout-17B-16E-Instruct"
+        )["train"]
+        ds = ds.map(add_index, with_indices=True)
+        proc_fn = None
+    elif args.dataset == "perfectblend-llama4-maverick-instruct":
+        ds = load_dataset(
+            "frankleeeee/PerfectBlend-Regenerated-Llama-4-Maverick-17B-128E-Instruct"
+        )["train"]
+        ds = ds.map(add_index, with_indices=True)
+        proc_fn = None
     elif args.dataset == "magpie-qwen2.5-pro-1m-v0.1":
         ds = load_dataset("Magpie-Align/Magpie-Qwen2.5-Pro-1M-v0.1")["train"]
         ds = ds.rename_column("uuid", "id")
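
Each of the new branches only loads the regenerated dataset from Hugging Face, attaches an index, and sets proc_fn = None, so process_and_save_ds writes the rows out unchanged. A standalone sketch of that path for one of the new datasets; add_index below is a local stand-in for the script's helper, and its exact behavior (storing the row index in an "id" field) is an assumption:

from datasets import load_dataset

def add_index(example, idx):
    # assumed behavior of the script's add_index helper
    example["id"] = idx
    return example

ds = load_dataset("frankleeeee/PerfectBlend-Regenerated-Llama-3.1-8B-Instruct")["train"]
ds = ds.map(add_index, with_indices=True)
# With proc_fn = None, every row is serialized to JSONL as-is.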
