Merged
7 changes: 3 additions & 4 deletions README.md
@@ -84,10 +84,9 @@ Leveraging a multi-role distributed architecture with Ray for flexible resource
[RAFT++](https://alibaba.github.io/ROLL/docs/English/UserGuide/algorithms/RAFT_Plus_Plus)
[StarPO](https://alibaba.github.io/ROLL/docs/English/UserGuide/algorithms/agentic_StarPO)

-#### Beckend
+#### Backend
[DeepSpeed](https://alibaba.github.io/ROLL/docs/English/UserGuide/backend/deepspeed)
[Megatron](https://alibaba.github.io/ROLL/docs/English/UserGuide/backend/megatron)
[LoRA](https://alibaba.github.io/ROLL/docs/English/UserGuide/backend/lora)
[Megatron](https://alibaba.github.io/ROLL/docs/English/UserGuide/backend/megatron)
[vLLM](https://alibaba.github.io/ROLL/docs/English/UserGuide/backend/vllm)
[SGLang](https://alibaba.github.io/ROLL/docs/English/UserGuide/backend/sglang)

@@ -119,7 +118,7 @@ Leveraging a multi-role distributed architecture with Ray for flexible resource
* Inference/Generation supports vLLM, SGLang.
* Training supports DeepSpeed (ZeRO), Megatron-LM 5D parallelism (mcore-adapter, dp/tp/pp/cp/ep), FSDP under implementation.
* Extreme offload/reload capabilities.
-* Supports LoRA training.
+* Supports [LoRA](https://alibaba.github.io/ROLL/docs/English/UserGuide/backend/lora) training.
* Supports FP8 rollout (FP8 inference for LLM as judge, FP8 rollout with BF16 training under development).
* **AutoDeviceMapping:** Supports custom device mapping for different roles, flexibly managing colocated and disaggregated deployments.
* **Observability:** Integrated with SwanLab / WandB / TensorBoard, tracking of performance for each domain and reward type.
2 changes: 1 addition & 1 deletion docs_roll/docs/English/UserGuide/algorithms/GRPO.md
@@ -26,7 +26,7 @@ adv_estimator: "grpo"
ppo_epochs: 1
use_kl_loss: true
kl_loss_coef: 0.001
-loss_agg_mode: "seq-mean-token-sum"
+loss_agg_mode: "seq-mean-token-mean"

# ppo related
# advantage
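The `loss_agg_mode` change above swaps per-sequence token *sums* for per-sequence token *means* before averaging over the batch, which stops long sequences from dominating the loss. A minimal pure-Python sketch of the two modes (the function name and list-based shapes are illustrative only; ROLL's actual implementation operates on masked tensors):

```python
def aggregate_loss(token_losses, mode):
    """Aggregate per-token losses for a batch of variable-length sequences.

    token_losses: list of sequences, each a list of per-token loss values.
    Illustrative sketch, not ROLL's actual code.
    """
    if mode == "seq-mean-token-sum":
        # Sum tokens within each sequence: long sequences dominate the batch mean.
        per_seq = [sum(seq) for seq in token_losses]
    elif mode == "seq-mean-token-mean":
        # Average tokens within each sequence: every sequence weighs equally.
        per_seq = [sum(seq) / len(seq) for seq in token_losses]
    else:
        raise ValueError(f"unknown loss_agg_mode: {mode}")
    # "seq-mean": average the per-sequence values over the batch.
    return sum(per_seq) / len(per_seq)

# With a uniform token loss of 1.0, "token-sum" scales with sequence length
# while "token-mean" does not:
batch = [[1.0, 1.0], [1.0, 1.0, 1.0, 1.0]]
print(aggregate_loss(batch, "seq-mean-token-sum"))   # 3.0
print(aggregate_loss(batch, "seq-mean-token-mean"))  # 1.0
```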
@@ -26,7 +26,7 @@ adv_estimator: "grpo"
ppo_epochs: 1
use_kl_loss: true
kl_loss_coef: 0.001
-loss_agg_mode: "seq-mean-token-sum"
+loss_agg_mode: "seq-mean-token-mean"

# ppo related
# advantage