@@ -39,6 +39,10 @@ def parse_args():
3939 "sharegpt" ,
4040 "eaglechat" ,
4141 "perfectblend" ,
42+ "perfectblend-llama3.1-8b-instruct" ,
43+ "perfectblend-llama3.3-70b-instruct" ,
44+ "perfectblend-llama4-scout-instruct" ,
45+ "perfectblend-llama4-maverick-instruct" ,
4246 "magpie-qwen2.5-pro-1m-v0.1" ,
4347 "sharegpt4v" ,
4448 "allava4v" ,
@@ -189,20 +193,26 @@ def process_and_save_ds(train_ds, test_ds, output_path, proc_fn, dataset_name):
189193 total_skipped_count = 0
190194 with open (train_output_jsonl_path , "w" ) as f :
191195 for item in tqdm (train_ds , desc = f"Processing { dataset_name } dataset" ):
192- row , skipped_count = proc_fn (item )
193- if row is None :
194- continue
195- total_skipped_count += skipped_count
196+ if proc_fn is not None :
197+ row , skipped_count = proc_fn (item )
198+ if row is None :
199+ continue
200+ total_skipped_count += skipped_count
201+ else :
202+ row = item
196203 f .write (json .dumps (row , ensure_ascii = False ) + "\n " )
197204
198205 if test_ds is not None :
199206 test_output_jsonl_path = output_path .joinpath (f"{ dataset_name } _test.jsonl" )
200207 with open (test_output_jsonl_path , "w" ) as f :
201208 for item in tqdm (test_ds , desc = f"Processing { dataset_name } test dataset" ):
202- row , skipped_count = proc_fn (item )
203- if row is None :
204- continue
205- total_skipped_count += skipped_count
209+ if proc_fn is not None :
210+ row , skipped_count = proc_fn (item )
211+ if row is None :
212+ continue
213+ total_skipped_count += skipped_count
214+ else :
215+ row = item
206216 f .write (json .dumps (row , ensure_ascii = False ) + "\n " )
207217
208218 if total_skipped_count > 0 :
@@ -252,6 +262,30 @@ def main():
252262 ds = load_dataset ("mlabonne/open-perfectblend" )["train" ]
253263 ds = ds .map (add_index , with_indices = True )
254264 proc_fn = process_sharegpt_row
265+ elif args .dataset == "perfectblend-llama3.1-8b-instruct" :
266+ ds = load_dataset ("frankleeeee/PerfectBlend-Regenerated-Llama-3.1-8B-Instruct" )[
267+ "train"
268+ ]
269+ ds = ds .map (add_index , with_indices = True )
270+ proc_fn = None
271+ elif args .dataset == "perfectblend-llama3.3-70b-instruct" :
272+ ds = load_dataset (
273+ "frankleeeee/PerfectBlend-Regenerated-Llama-3.3-70B-Instruct"
274+ )["train" ]
275+ ds = ds .map (add_index , with_indices = True )
276+ proc_fn = None
277+ elif args .dataset == "perfectblend-llama4-scout-instruct" :
278+ ds = load_dataset (
279+ "frankleeeee/PerfectBlend-Regenerated-Llama-4-Scout-17B-16E-Instruct"
280+ )["train" ]
281+ ds = ds .map (add_index , with_indices = True )
282+ proc_fn = None
283+ elif args .dataset == "perfectblend-llama4-maverick-instruct" :
284+ ds = load_dataset (
285+ "frankleeeee/PerfectBlend-Regenerated-Llama-4-Maverick-17B-128E-Instruct"
286+ )["train" ]
287+ ds = ds .map (add_index , with_indices = True )
288+ proc_fn = None
255289 elif args .dataset == "magpie-qwen2.5-pro-1m-v0.1" :
256290 ds = load_dataset ("Magpie-Align/Magpie-Qwen2.5-Pro-1M-v0.1" )["train" ]
257291 ds = ds .rename_column ("uuid" , "id" )
0 commit comments