Skip to content

Commit 668b994

Browse files
committed
[Feat] convert sft dataset in chat template style
1 parent ff067f2 commit 668b994

File tree

1 file changed

+40
-0
lines changed

1 file changed

+40
-0
lines changed

data/generate_sft_verl.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
import os
2+
import argparse
3+
from datasets import Dataset, load_dataset
4+
from tqdm import tqdm
5+
6+
7+
def make_map_fn(split):
    """Build the per-example mapper that reshapes a raw record into a Verl-style row.

    Args:
        split: Name of the dataset split being processed. It is stored
            unchanged under ``extra_info["split"]`` of every produced row.

    Returns:
        A function ``process_fn(example, idx)`` intended to be passed to
        ``Dataset.map(..., with_indices=True)``.
    """

    def process_fn(example, idx):
        """Convert one record into the output row.

        Args:
            example: Mapping for one record; must provide the keys
                ``'conversations'`` and ``'id'``.
            idx: Position of the record within the split.

        Returns:
            A dict with the keys ``data_source``, ``prompt``, ``ability``,
            ``reward_model`` and ``extra_info``.

        Raises:
            KeyError: If ``example`` lacks ``'conversations'`` or ``'id'``.
        """
        return {
            "data_source": "openmanus-rl",
            # The conversation is forwarded untouched as the prompt.
            # NOTE(review): presumably a list of chat-style messages -- confirm
            # against the dataset schema.
            "prompt": example['conversations'],
            "ability": "instruction-following",
            # No reward signal is attached: the style is "none" and there is
            # no ground truth.
            "reward_model": {
                "style": "none",
                "ground_truth": None,
            },
            "extra_info": {
                "split": split,
                "index": idx,
                "id": example['id'],
            },
        }

    return process_fn
24+
25+
26+
def main():
    """Download one split of the OpenManus-RL dataset, remap it, and write it as parquet.

    Command-line arguments:
        --output_dir: Directory that receives the parquet file (required).
            It is created if it does not exist.
        --split: Dataset split to load; defaults to ``"train"``. The value is
            also used as the output file stem (``<split>.parquet``).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--output_dir', required=True, help="Output directory for processed parquet")
    parser.add_argument('--split', type=str, default="train")

    args = parser.parse_args()

    # Load from Hugging Face Hub
    dataset = load_dataset("CharlieDreemur/OpenManus-RL", split=args.split)

    # Apply mapping to Verl format
    dataset = dataset.map(function=make_map_fn(args.split), with_indices=True)

    # Ensure the target directory exists before writing the parquet file.
    os.makedirs(args.output_dir, exist_ok=True)
    dataset.to_parquet(os.path.join(args.output_dir, f"{args.split}.parquet"))


if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)