Skip to content

Commit 25ff2d4

Browse files
realtmxi and Kunlun-Zhu
authored
sft & eval script (#68)
* [feat]: offline rollout evaluation * [feat]: sync with verl/utils/dataset/ lastest code * feat: update requirements.txt * [feat]: extract the webshop testset from AgentEval dataset * feat: run_sft script * feat: webshop evaluation script --------- Co-authored-by: Kunlun Zhu <[email protected]>
1 parent ba5d504 commit 25ff2d4

File tree

13 files changed

+1306
-110
lines changed

13 files changed

+1306
-110
lines changed

data/webshop/webshop.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
import json

from datasets import load_dataset


def _is_webshop(example):
    """Predicate: keep rows whose item_id carries the webshop_ prefix."""
    return example["item_id"].startswith("webshop_")


# Pull the full AgentEval benchmark, test split only.
ds = load_dataset("AgentGym/AgentEval", split="test")

# Restrict to the WebShop subset of the benchmark.
webshop_ds = ds.filter(_is_webshop)

# Quick sanity print of the filtered dataset.
print(webshop_ds)

output_file = "webshop_inference.json"

# One record per webshop item, with an empty conversation to be filled at
# inference time.
data = []
for row in webshop_ds:
    data.append({"item_id": row["item_id"], "conversations": []})

with open(output_file, "w") as f:
    json.dump(data, f, indent=2)
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
#!/bin/bash
# Offline evaluation of an SFT checkpoint on the WebShop task.
# Requires the WebShop EnvServer to be reachable at ${env_server_base}.
set -euo pipefail

# Evaluation args
model_path="/data1/models/openmanus_rl/Qwen/Qwen3-3b-sft/global_step_1"
inference_file="/home/user/muxin/OpenManus-RL/data/webshop/webshop_inference.json"
output_file="/data1/models/openmanus_rl/Qwen/Qwen3-3b-sft/output/qwen2.5-3b-webshop.log"
task_name="webshop"
seed="42"

# environment parameters
max_round="6"                               # max agent/env interaction turns per episode
env_server_base="http://127.0.0.1:36001"    # base URL of the EnvServer

# -u: unbuffered stdout so progress shows up in logs immediately.
python -u base_eval_template.py \
    --model_path "${model_path}" \
    --inference_file "${inference_file}" \
    --output_file "${output_file}" \
    --task_name "${task_name}" \
    --seed "${seed}" \
    --max_round "${max_round}" \
    --env_server_base "${env_server_base}"
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
#!/bin/bash
# Distributed (multi-process) evaluation of an agent model on a single task
# via HuggingFace accelerate. Results land in ${output_dir}/inference.jsonl.
set -euo pipefail

exp_name="eval_webshop"
inference_file='/home/user/muxin/OpenManus-RL/data/webshop/webshop_inference.json' # Path to the trainset file which contains idxs for the task.

num_processes='8'
main_process_port='8877'
weight_decay="0"

### Default variables
task_name="webshop" # change this to evaluate on a different task
output_dir="/data1/models/openmanus_rl/Qwen/Qwen3-3b-sft/output"

# agent model
#model_path="/data1/models/openmanus_rl/Qwen/Qwen3-3b-sft/global_step_1"
model_path="/data1/models/Qwen/Qwen2.5-3B"
eval_batch_size="1"
num_workers="8"
seed="42"
do_sample="False"         # greedy decoding by default
temperature="1.0"

max_round="6"
env_server_base="http://127.0.0.1:36001" # Set this to the base url of the EnvServer.
timeout="2400"            # per-episode timeout in seconds


#########
mkdir -p "${output_dir}"
export PYTHONPATH=/home/user/muxin/OpenManus-RL/openmanus_rl/agentgym/agentenv:$PYTHONPATH # You need to modify this as your agentgym/agentenv absolute path

accelerate launch \
    --num_processes="${num_processes}" \
    --main_process_port="${main_process_port}" \
    ../../utils/distributed_eval_task.py \
    --model_path "${model_path}" \
    --output_file "${output_dir}/inference.jsonl" \
    --inference_file "${inference_file}" \
    --task_name "${task_name}" \
    --eval_batch_size "${eval_batch_size}" \
    --num_workers "${num_workers}" \
    --seed "${seed}" \
    --do_sample "${do_sample}" \
    --temperature "${temperature}" \
    --max_round "${max_round}" \
    --env_server_base "${env_server_base}" \
    --data_len 200 \
    --timeout "${timeout}"

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,4 @@ vllm<=0.6.3
1414
wandb
1515
IPython
1616
matplotlib
17-
omegaconf
17+
omegaconf

scripts/offline_rollout.sh

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
#!/bin/bash
# Offline rollout evaluation wrapper around traj_generation/rollout_eval.py.
# Parses CLI flags, builds the python command, and tees output to a log file.
#
# pipefail: without it, `python … | tee` reports tee's (always 0) status and
# a failed evaluation would look successful.
set -euo pipefail

CONFIG_FILE="" # fulfill the config yaml file here
MODEL_PATH=""
OUTPUT_DIR=""
TASK_NAMES=""
DATA_LEN=200
TIMEOUT=2400
DO_SAMPLE="False"
TEMPERATURE=1.0
SEED=42
DEBUG=false

# Parse command line arguments
while [[ $# -gt 0 ]]; do
  case "$1" in
    --config)
      CONFIG_FILE="$2"
      shift 2
      ;;
    --model_path)
      MODEL_PATH="$2"
      shift 2
      ;;
    --output_dir)
      OUTPUT_DIR="$2"
      shift 2
      ;;
    --task_names)
      TASK_NAMES="$2"
      shift 2
      ;;
    --data_len)
      DATA_LEN="$2"
      shift 2
      ;;
    --timeout)
      TIMEOUT="$2"
      shift 2
      ;;
    --do_sample)
      DO_SAMPLE="$2"
      shift 2
      ;;
    --temperature)
      TEMPERATURE="$2"
      shift 2
      ;;
    --seed)
      SEED="$2"
      shift 2
      ;;
    --debug)
      DEBUG=true
      shift
      ;;
    *)
      echo "Unknown option: $1" >&2
      exit 1
      ;;
  esac
done

# Build the command as an argv array rather than a flat string + eval:
# arguments containing spaces or shell metacharacters are passed through
# verbatim instead of being re-split or re-interpreted.
cmd=(python traj_generation/rollout_eval.py --config "$CONFIG_FILE")

if [[ -n "$MODEL_PATH" ]]; then
  cmd+=(--model_path "$MODEL_PATH")
fi

if [[ -n "$OUTPUT_DIR" ]]; then
  cmd+=(--output_dir "$OUTPUT_DIR")
fi

if [[ -n "$TASK_NAMES" ]]; then
  cmd+=(--task_names "$TASK_NAMES")
fi

cmd+=(--data_len "$DATA_LEN" --timeout "$TIMEOUT" --do_sample "$DO_SAMPLE" --temperature "$TEMPERATURE" --seed "$SEED")

if [[ "$DEBUG" = true ]]; then
  cmd+=(--debug)
fi

# Create log directory
TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
LOG_DIR="./logs"
mkdir -p "$LOG_DIR"
LOG_FILE="$LOG_DIR/offline_rollout_$TIMESTAMP.log"

# Print the command
echo "Running: ${cmd[*]}"
echo "Logging to: $LOG_FILE"

# Execute with logging (no eval needed — the array IS the command line).
"${cmd[@]}" | tee "$LOG_FILE"

echo "Evaluation complete! Results saved to the output directory."

scripts/run_sft.sh

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
#!/bin/bash
# Multiturn SFT launcher for verl's FSDP trainer.
# Usage: run_sft.sh <nproc_per_node> <save_path> [other_configs...]
#   extra args are forwarded verbatim as hydra-style key=value overrides.
set -x

if [ "$#" -lt 2 ]; then
    echo "Usage: run_sft.sh <nproc_per_node> <save_path> [other_configs...]" >&2
    exit 1
fi

nproc_per_node=$1
save_path=$2

# Shift the arguments so $@ refers to the rest (user overrides)
shift 2

# Quoting notes:
# - "$@" keeps each user override a single argument even if it has spaces.
# - trainer.logger=['console'] is quoted so the brackets are never treated
#   as a glob pattern by the shell.
torchrun --standalone --nnodes=1 --nproc_per_node="$nproc_per_node" \
    -m verl.trainer.fsdp_sft_trainer \
    data.train_files=OpenManus-RL/data/train_split.parquet \
    data.val_files=OpenManus-RL/data/test_split.parquet \
    data.multiturn.enable=true \
    data.multiturn.messages_key=prompt \
    data.micro_batch_size=4 \
    model.partial_pretrain=/data1/models/Qwen/Qwen3-4B \
    trainer.default_local_dir="$save_path" \
    trainer.project_name=multiturn-sft \
    trainer.experiment_name=multiturn-sft-qwen-3-4b \
    "trainer.logger=['console']" \
    trainer.total_training_steps=1 \
    trainer.default_hdfs_dir=null "$@" \
    ulysses_sequence_parallel_size=2 \
    use_remove_padding=true

0 commit comments

Comments
 (0)