Skip to content

Commit bfca279

Browse files
committed
update png
1 parent 99bff6d commit bfca279

File tree

6 files changed

+27
-2
lines changed

6 files changed

+27
-2
lines changed
-15.8 KB
Loading
464 KB
Loading
-50.4 KB
Loading

docs/sphinx_doc/source/tutorial/example_search_email.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,5 +48,6 @@ The results are shown in the following figure (the accuracy ranges from -0.1 to
4848

4949
![](../../assets/email_rollout_accuracy.png)
5050

51+
![](../../assets/email_reward_mean.png)
5152

5253
![](../../assets/email_eval_accuracy.png)

docs/sphinx_doc/source_zh/tutorial/example_search_email.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,4 +44,6 @@ trinity run --config examples/grpo_email_search/email_search.yaml
4444

4545
![](../../assets/email_rollout_accuracy.png)
4646

47+
![](../../assets/email_reward_mean.png)
48+
4749
![](../../assets/email_eval_accuracy.png)

examples/grpo_email_search/email_search.yaml

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,20 @@ algorithm:
66
repeat_times: 8
77
optimizer:
88
lr: 1e-6
9+
policy_loss_fn: "rec"
10+
policy_loss_fn_args:
11+
epsilon_low: 0.2
12+
epsilon_high: 0.2
13+
clip_mode: "one-side"
14+
weight: "none"
15+
temp: 1.0
16+
regularizer: "none"
17+
regularizer_coef: 0.0
18+
kl_loss_fn: 'k2'
19+
kl_loss_fn_args:
20+
kl_coef: 0.0
21+
advantage_fn_args:
22+
std_cal_level: 'batch'
923
model:
1024
model_path: ${oc.env:TRINITY_MODEL_PATH,Qwen/Qwen3-4B-Instruct-2507}
1125
max_response_tokens: 4096
@@ -15,8 +29,8 @@ cluster:
1529
gpu_per_node: 8
1630
buffer:
1731
total_epochs: 1
18-
batch_size: 16
19-
train_batch_size: 640 # 16*8*5
32+
batch_size: 64
33+
train_batch_size: 2560 # batch_size (64) * repeat_times (8) * 5
2034
explorer_input:
2135
taskset:
2236
name: enron_train
@@ -56,6 +70,12 @@ buffer:
5670
storage_type: queue
5771
replay_buffer:
5872
enable: true
73+
# reuse_cooldown_time is left unset (None)
74+
priority_fn: 'decay_limit_randomization'
75+
priority_fn_args:
76+
decay: 2.0
77+
use_count_limit: 3
78+
sigma: 2.0
5979
explorer:
6080
eval_interval: 10
6181
max_repeat_times_per_runner: 1
@@ -93,3 +113,5 @@ trainer:
93113
use_dynamic_bsz: true
94114
max_token_len_per_gpu: 16384
95115
ulysses_sequence_parallel_size: 1
116+
monitor:
117+
monitor_type: wandb

0 commit comments

Comments
 (0)