We read every piece of feedback and take your input very seriously.
To see all available qualifiers, see our documentation.
1 parent f0c2490, commit 76d1d1f (Copy full SHA for 76d1d1f)
docs/source/paper_index.md
@@ -592,7 +592,8 @@ from trl import GRPOConfig
592
593
training_args = GRPOConfig(
594
loss_type="vespo",
595
- importance_sampling_level="token",
+ use_vllm=True, # or False if not using any token-level `vllm_importance_sampling_correction` methods
596
+ vllm_importance_sampling_mode="token_truncate", # default correction mode for VESPO, `token_mask` also supported
597
vespo_k_pos=2.0, # Power exponent (c1 in paper Section 3.4) for positive advantages
598
vespo_lambda_pos=3.0, # Decay factor (c2 in paper Section 3.4) for positive advantages
599
vespo_k_neg=3.0, # Power exponent (c1 in paper Section 3.4) for negative advantages
0 commit comments