12 changes: 12 additions & 0 deletions areal/api/cli_args.py
@@ -35,6 +35,18 @@ class NormConfig:
group_size: int = field(
default=1, metadata={"help": "Group size for group-level normalization"}
)
adv_norm_mode: str = field(
default="native",
metadata={
"help": "Advantage normalization mode: 'native' or 'mix'. 'native' applies standard z-score normalization; 'mix' computes both the standard (deviation-based) and mean-based normalizations and aggregates them (see the MAPO paper for details)."
},
)
reward_norm_mode: str = field(
default="native",
metadata={
"help": "Mode for reward normalization. Currently only 'native' is supported."
},
)


@dataclass
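For orientation only (this example is not part of the diff), the extended `NormConfig` could be instantiated directly as below; the `mean_level`, `std_level`, and `group_size` values are placeholders rather than tuned recommendations.

```python
from areal.api.cli_args import NormConfig

# Illustrative values only: enable MAPO's mixed advantage normalization
# while leaving reward normalization in its default 'native' mode.
adv_norm = NormConfig(
    mean_level="group",
    std_level="group",
    group_size=8,
    adv_norm_mode="mix",        # 'native' keeps plain z-score normalization
    reward_norm_mode="native",  # only 'native' is currently supported
)
```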
12 changes: 7 additions & 5 deletions areal/engine/ppo/actor.py
@@ -7,7 +7,11 @@
from areal.api.engine_api import TrainEngine
from areal.engine.fsdp_engine import FSDPEngine
from areal.utils import stats_tracker
from areal.utils.data import Normalization, split_padded_tensor_dict_into_mb_list
from areal.utils.data import (
get_adv_norm,
get_reward_norm,
split_padded_tensor_dict_into_mb_list,
)
from areal.utils.functional import (
dynamic_sampling,
gather_logprobs,
@@ -31,10 +35,8 @@ def __init__(self, config: PPOActorConfig, engine: TrainEngine):

self.kl_ctl = config.kl_ctl

self.adv_norm = Normalization(config.adv_norm) if config.adv_norm else None
self.reward_norm = (
Normalization(config.reward_norm) if config.reward_norm else None
)
self.adv_norm = get_adv_norm(config)
self.reward_norm = get_reward_norm(config)

self.discount = config.discount
self.gae_lambda = config.gae_lambda
459 changes: 457 additions & 2 deletions areal/utils/data.py

Large diffs are not rendered by default.
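The `areal/utils/data.py` diff is not rendered above, so the bodies of the new `get_adv_norm` and `get_reward_norm` helpers imported by `actor.py` are not visible here. The following is only a hedged sketch of the dispatch they presumably perform on the new `adv_norm_mode` and `reward_norm_mode` fields; the real implementation may differ substantially.

```python
# Hypothetical sketch only; the real factories live in areal/utils/data.py.
from areal.utils.data import Normalization  # pre-existing helper used by the old actor.py code


def get_adv_norm(config):
    """Build the advantage normalizer from a PPOActorConfig (None if disabled)."""
    if not config.adv_norm:
        return None
    if config.adv_norm.adv_norm_mode == "native":
        return Normalization(config.adv_norm)
    # 'mix' presumably returns a MAPO-style normalizer that aggregates the
    # deviation-based and mean-based z-scores (see docs/algorithms/mapo.md).
    raise NotImplementedError("sketch only; see areal/utils/data.py")


def get_reward_norm(config):
    """Build the reward normalizer from a PPOActorConfig (None if disabled)."""
    if not config.reward_norm:
        return None
    return Normalization(config.reward_norm)  # only 'native' is currently supported
```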

13 changes: 12 additions & 1 deletion areal/utils/functional.py
@@ -96,6 +96,7 @@ def masked_normalization(
high_precision=True,
all_reduce=True,
reduce_group=None,
calculation_base: str = "deviation",
):
dtype = torch.float64 if high_precision else torch.float32
x = x.to(dtype)
@@ -124,7 +125,17 @@
var = meansq - mean**2
if unbiased:
var *= factor / (factor - 1)
return ((x - mean) / (var.sqrt() + eps)).float()
assert calculation_base in [
"mean",
"deviation",
], "calculation_base must be either mean or deviation"

std = var.sqrt()
base = std if calculation_base == "deviation" else mean
# Ensure stability
base = base + eps
# Normalize
return ((x - mean) / base).float()


def ppo_actor_loss_fn(
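To make the new `calculation_base` switch concrete, here is a small standalone illustration of the two bases on a toy group of binary rewards. It uses plain tensor math rather than a call to `masked_normalization`, since the function's full signature is not shown in this hunk.

```python
import torch

# Toy per-trajectory rewards within one group (binary, as MAPO expects).
r = torch.tensor([1.0, 1.0, 0.0, 1.0], dtype=torch.float64)
eps = 1e-5

mean = r.mean()
std = r.std(unbiased=False)  # population std, matching unbiased=False above

# calculation_base="deviation": the usual z-score (previous behavior).
deviation_based = (r - mean) / (std + eps)   # approx [ 0.577,  0.577, -1.732,  0.577]

# calculation_base="mean": deviation measured relative to the group mean.
mean_based = (r - mean) / (mean + eps)       # approx [ 0.333,  0.333, -1.000,  0.333]
```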
1 change: 1 addition & 0 deletions docs/_toc.yml
@@ -41,6 +41,7 @@ parts:
- file: algorithms/dapo
- file: algorithms/dr.GRPO
- file: algorithms/litePPO
- file: algorithms/mapo
- caption: Customization (Legacy)
chapters:
- file: legacy/customization/dataset
53 changes: 53 additions & 0 deletions docs/algorithms/mapo.md
@@ -0,0 +1,53 @@
# Mixed Advantage Policy Optimization (MAPO)

Last updated: Sep 27, 2025

Author: [Ziyi ZENG](https://github.com/ZiyiTsang)

![MAPO overview](../figures/MAPO.jpg)

Mixed Advantage Policy Optimization (MAPO) is an improved Group Relative Policy Optimization (GRPO) strategy designed to enhance the reasoning performance of foundation models. While GRPO has been effective for post-training foundation models on reasoning tasks, it suffers from "advantage reversion" and "advantage mirror" problems, which lead to an unreasonable allocation of advantage across different query samples. MAPO addresses these limitations by introducing the concept of "trajectory certainty" and proposing an "Advantage Percent Deviation" (APD) for high-certainty trajectories, and it dynamically reweights the advantage function based on trajectory certainty through "Trajectory Certainty Reweight" (TCR). This adaptive approach allows MAPO to tailor the advantage function to sample-specific characteristics, mitigating the shortcomings of prior advantage formulations and yielding more stable and accurate reasoning performance across diverse tasks.

The overall surrogate objective is:


$$\mathcal{J}_{\mathrm{GRPO}}(\theta)=\mathbb{E}_{q\sim\rho_{Q}}\mathbb{E}_{o\sim\pi_{\mathrm{old}}(\cdot|q)}\left[\frac{1}{G}\sum_{i=1}^{G}f_{\epsilon}\left(\frac{\pi_{\theta}(o_{i}|q)}{\pi_{\mathrm{old}}(o_{i}|q)},\hat{A}_{i}\right)\right]-\beta\,\mathbb{D}_{KL}\left[\pi_{\theta}\,\|\,\pi_{\mathrm{ref}}\right],$$
where:
$$f_\epsilon(x,y)=\min(xy,\mathrm{clip}(x,1-\epsilon,1+\epsilon)y)$$

$$\lambda(p)=1-4p(1-p)\in[0,1]\quad(p\in[0,1])$$

$$\hat{A}_i^*=(1-\lambda(p))\cdot\underbrace{\frac{r_i-\mu}{\sigma}}_{\text{Deviation-based}}+\lambda(p)\cdot\underbrace{\frac{r_i-\mu}{\mu}}_{\text{Mean-based}}.$$


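The sketch below mirrors the formula above for a single group, assuming (as the Notice section requires) binary rewards so that the group mean reward can stand in for the success probability $p$. It illustrates the math only and is not AReaL's actual `adv_norm_mode: mix` code path in `areal/utils/data.py`.

```python
import torch


def mapo_mixed_advantage(rewards: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
    """Mixed advantage for one group of binary rewards (illustrative sketch)."""
    mu = rewards.mean()
    sigma = rewards.std(unbiased=False)
    p = mu                                   # with {0, 1} rewards, the mean is the success rate
    lam = 1.0 - 4.0 * p * (1.0 - p)          # certainty reweighting, lambda(p)
    deviation_based = (rewards - mu) / (sigma + eps)   # GRPO-style z-score
    mean_based = (rewards - mu) / (mu + eps)           # Advantage Percent Deviation
    return (1.0 - lam) * deviation_based + lam * mean_based


# Example: 3 successful rollouts out of 4 in a group.
adv = mapo_mixed_advantage(torch.tensor([1.0, 1.0, 0.0, 1.0]))
```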
For more details:

- AReaL details: [AReaL paper](https://arxiv.org/abs/2505.24298)

- MAPO details: [MAPO paper](https://arxiv.org/abs/2509.18849v3)

## Algorithm Core Parameters

- `actor.adv_norm.adv_norm_mode`: selects the advantage normalization implementation. `native` is the z-score normalization used by GRPO, while `mix` enables the MAPO formulation.

## Notice
For the MAPO implementation, the following constraints should be met:

1. The `reward_function` must return binary outcomes (any two distinct values): the higher value marks a successful trajectory and the lower value a failed one. A minimal sketch of such a reward function is shown below.
2. The `overlong_reward_penalty` should be disabled.
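As an illustration of constraint 1 only, a binary reward could look like the sketch below; the function name, signature, and answer-extraction logic are assumptions for this example, not AReaL's actual `reward_function` interface.

```python
import re


def binary_math_reward(completion: str, reference_answer: str) -> float:
    """Hypothetical binary reward: 1.0 if the last number in the completion
    matches the reference answer, 0.0 otherwise."""
    numbers = re.findall(r"-?\d+(?:\.\d+)?", completion)
    return 1.0 if numbers and numbers[-1] == reference_answer.strip() else 0.0
```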


## Example Usage

We recommend changing the parameters in the configuration file
(i.e., `gsm8k_mapo.yaml`).

| Backend | CMD |
| --------- | -------------------------------------------------------------------------------------------------------------------------------- |
| **local** | `python3 -m areal.launcher.local examples/experimental/mapo/gsm8k_mapo.py --config examples/experimental/mapo/gsm8k_mapo.yaml --<other_args_to_overwrite>` |
| **ray** | `python3 -m areal.launcher.ray examples/experimental/mapo/gsm8k_mapo.py --config examples/experimental/mapo/gsm8k_mapo.yaml --<other_args_to_overwrite>` |
| **slurm** | `python3 -m areal.launcher.slurm examples/experimental/mapo/gsm8k_mapo.py --config examples/experimental/mapo/gsm8k_mapo.yaml --<other_args_to_overwrite>` |

## Baselines

We do not yet have baseline results for MAPO; contributions are welcome!
17 changes: 12 additions & 5 deletions docs/cli_reference.md
@@ -274,11 +274,13 @@ Specification for splitting micro-batches during training.

Configuration for reward/advantage normalization.

| Parameter | Type | Default | Description |
| ------------ | -------------- | --------- | ------------------------------------------------------------------------------------------------- |
| `mean_level` | string \| None | `"batch"` | Mean level for normalization. Choices: batch, group. Omit for no mean normalization. |
| `std_level` | string \| None | `"batch"` | Standard deviation level for normalization. Choices: batch, group. Omit for no std normalization. |
| `group_size` | integer | `1` | Group size for group-level normalization |
| Parameter | Type | Default | Description |
| ------------------ | -------------- | ---------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `mean_level` | string \| None | `"batch"` | Mean level for normalization. Choices: batch, group. Omit for no mean normalization. |
| `std_level` | string \| None | `"batch"` | Standard deviation level for normalization. Choices: batch, group. Omit for no std normalization. |
| `group_size` | integer | `1` | Group size for group-level normalization |
| `adv_norm_mode` | string | `"native"` | Advantage normalization mode: 'native' or 'mix'. 'native' applies standard z-score normalization; 'mix' computes both the standard (deviation-based) and mean-based normalizations and aggregates them (see the MAPO paper for details). |
| `reward_norm_mode` | string | `"native"` | Mode for reward normalization. Currently only 'native' is supported. |

(section-optimizer)=

@@ -381,6 +383,11 @@ Configuration for PPO critic model, a subclass of a TrainEngine.
| `optimizer` | [`OptimizerConfig`](section-optimizer) \| None | `None` | Optimizer configuration. None means no training. |
| `backend` | string | `""` | Training backend (refer to documentation) |
| `fsdp` | [`FSDPEngineConfig`](section-fsdp-engine) | **Required** | - |
| `use_lora` | boolean | `False` | Whether to use LoRA. Only FSDP is supported. Note that this should be enabled together with vLLM/SGLang. |
| `lora_rank` | integer | `32` | LoRA rank |
| `lora_alpha` | integer | `16` | LoRA alpha |
| `target_modules` | list of string | **Required** | LoRA target_modules. None defaults to 'all-linear' |
| `peft_type` | string | `"lora"` | PEFT method type. Only LoRA is supported for now. |
| `ppo_n_minibatches` | integer | `4` | Number of minibatches for each PPO update |
| `eps_clip` | float | `0.5` | Clipping factor for value loss |
| `mask_no_eos_with_zero` | boolean | `False` | Mask truncated generations (no EOS token) and exclude from training |
Binary file added docs/figures/MAPO.jpg
2 changes: 1 addition & 1 deletion examples/experimental/dr.grpo/gsm8k_drgrpo.yaml
@@ -1,4 +1,4 @@
experiment_name: gsm8k-grpo
experiment_name: gsm8k-drgrpo
trial_name: trial0

seed: 1
2 changes: 1 addition & 1 deletion examples/experimental/lite_ppo/gsm8k_liteppo.yaml
@@ -1,4 +1,4 @@
experiment_name: gsm8k-grpo
experiment_name: gsm8k-liteppo
trial_name: trial0

seed: 1