New Algorithm: MAPO implementation #388
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -11,7 +11,7 @@ | |
| from einops import rearrange | ||
| from torchdata.stateful_dataloader import StatefulDataLoader | ||
|
|
||
| from areal.api.cli_args import MicroBatchSpec, NormConfig | ||
| from areal.api.cli_args import MicroBatchSpec, NormConfig, PPOActorConfig | ||
| from areal.platforms import current_platform | ||
| from areal.utils import datapack, logging | ||
|
|
||
|
|
@@ -1070,6 +1070,7 @@ def cycle_dataloader(dataloader: StatefulDataLoader): | |
| g = iter(dataloader) | ||
|
|
||
|
|
||
| # base ('native') normalization implementation, used for both reward and advantage normalization | ||
| class Normalization: | ||
| """ | ||
| Adaptive normalization with different levels. | ||
|
|
@@ -1108,7 +1109,11 @@ def __call__( | |
| loss_mask: Optional[torch.Tensor] = None, | ||
| high_precision: bool = True, | ||
| reduce_group=None, | ||
| calculation_base: str = "deviation", | ||
| ) -> torch.Tensor: | ||
|
|
||
| # x can be advantage or reward in shape [bs*self.group_size, max_tokens] | ||
|
|
||
| bs = x.size(0) | ||
| eps = self.eps | ||
|
|
||
|
|
@@ -1200,8 +1205,15 @@ def __call__( | |
| std = torch.ones_like(x) | ||
| eps = 0.0 | ||
|
|
||
| assert calculation_base in [ | ||
| "mean", | ||
| "deviation", | ||
| ], "calculation_base must be either mean or deviation" | ||
| base = std if calculation_base == "deviation" else mean | ||
| # Ensure numerical stability (avoid mutating mean/std in place) | ||
| base = base + eps | ||
| # Normalize | ||
| return (x_centered / (std + eps)).float() | ||
| return (x_centered / base).float() | ||
|
|
||
| @staticmethod | ||
| def _compute_mean( | ||
|
|
@@ -1362,3 +1374,115 @@ def _compute_approx_kl( | |
| if apply_clamp: | ||
| log_ratio = log_ratio.clamp(min=-10, max=10) | ||
| return log_ratio | ||
|
|
||
|
|
||
| # Mixed advantage normalization from the MAPO paper, derived from the base Normalization implementation | ||
| class MAPOAdvNorm(Normalization): | ||
| def __call__(self, advantages, loss_mask=None, **kwargs) -> torch.Tensor: | ||
| # Count the unique non-zero values in the advantages tensor (0 corresponds to advantages over pad tokens) | ||
|
|
||
| # deviation_base_norm shape [batch_size*group_size, max_token] | ||
| deviation_base_norm = super().__call__( | ||
| advantages, loss_mask=loss_mask, calculation_base="deviation", **kwargs | ||
| ) | ||
|
|
||
| unique_values = torch.unique(advantages[advantages != 0]) | ||
| unique_elements = unique_values.numel() | ||
|
|
||
| if unique_elements >= 3 or unique_elements <= 1: | ||
| if unique_elements >= 3: | ||
| logger.warning( | ||
| f"MAPO only supports binary reward modeling, but detected {unique_elements} unique values in the advantages tensor. Please check: " | ||
| "1. the definition of reward_fun returns binary values; " | ||
| "2. overlong_reward_penalty is set to false." | ||
| ) | ||
| # all non-zero advantages share a single value (or the batch is all zeros) | ||
| else: | ||
| logger.info( | ||
| "the advantages are all identical in the batch; please check your reward function" | ||
| ) | ||
|
|
||
| logger.info((f"falling back to native advantage normalization")) | ||
ZiyiTsang marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| # fall back to native implementation is ok | ||
| return super().__call__( | ||
| advantages, loss_mask=loss_mask, calculation_base="deviation", **kwargs | ||
| ) | ||
|
|
||
| # 'unique_upper_value' is the reward value of a successful trajectory | ||
| unique_upper_value, unique_lower_value = ( | ||
| unique_values.max().item(), | ||
| unique_values.min().item(), | ||
| ) | ||
|
|
||
| assert unique_elements == 2, ( | ||
| f"MAPO only supports binary reward modeling, but detected {unique_elements} unique values in the advantages tensor. Please check: " | ||
| "1. the definition of reward_fun returns binary values; " | ||
| "2. overlong_reward_penalty is set to false." | ||
| ) | ||
|
|
||
| # mean_base_norm shape [batch_size*group_size, max_token] | ||
| mean_base_norm = super().__call__( | ||
| advantages, loss_mask=loss_mask, calculation_base="mean", **kwargs | ||
| ) | ||
|
|
||
| bs, max_token = advantages.shape[0] // self.group_size, advantages.shape[-1] | ||
|
|
||
| # since the advantage is constant within a trajectory, take the trajectory-level advantage from the first token | ||
| # based on the assumption that the advantage is identical along the token dimension | ||
|
|
||
| advantages_ = advantages[:, 0] # advantages shape [batch_size*group_size] | ||
|
Contributor
This line assumes that the advantages tensor is constant across the time dimension for each trajectory.
Collaborator (Author)
We do assume the advantages tensor is constant across the time dimension for each trajectory. This does not hold for PPO, but it makes sense for GRPO.
Collaborator
This line does not take any effect and should be removed.
Collaborator (Author)
No, it is useful; please see the code comment.
Collaborator (Author)
The advantage of the first token is extracted and used in the logic below.
||
|
|
||
| advantages_ = advantages_.reshape( | ||
| bs, self.group_size | ||
| ) # advantages shape [batch_size, group_size] | ||
|
|
||
| # the number of successful trajectories within each group | ||
|
||
| success_trajectory_nums_per_group = (advantages_ == unique_upper_value).sum( | ||
| dim=1 | ||
| ) # success_trajectory_nums shape [batch_size] | ||
| # the total number of trajectories within each group | ||
| total_trajectory_nums_per_group = torch.full_like( | ||
| success_trajectory_nums_per_group, self.group_size | ||
| ) # total_trajectory_nums shape [batch_size] | ||
|
||
| # the empirical success probability p of trajectories within each group | ||
| trajectory_certainty_degree = ( | ||
| success_trajectory_nums_per_group / total_trajectory_nums_per_group | ||
| ) | ||
|
|
||
| # trajectory_reweight shape [batch_size]: lambda(p) = 1 - 4p(1-p), the per-group weight of the mean-based term | ||
| trajectory_reweight = ( | ||
| 1 - 4 * trajectory_certainty_degree * (1 - trajectory_certainty_degree) | ||
| ) | ||
| # expand trajectory_reweight from group granularity to token granularity | ||
| # [batch_size] -> [batch_size*group_size] -> [batch_size*group_size, max_token]; every token of a trajectory shares the same reweight | ||
| trajectory_reweight = ( | ||
| trajectory_reweight.repeat_interleave(self.group_size) | ||
| .unsqueeze(-1) | ||
| .expand(-1, max_token) | ||
| ) | ||
| # 'trajectory_reweight', 'deviation_base_norm', and 'mean_base_norm' now share the same shape, so they combine elementwise | ||
| return ( | ||
| 1 - trajectory_reweight | ||
| ) * deviation_base_norm + trajectory_reweight * mean_base_norm | ||
|
Comment on lines +1469 to +1471
Collaborator
double-check the formula. Since the
Collaborator (Author)
done
Collaborator (Author)
my mistake. thank you
||
|
|
||
|
|
||
| def get_reward_norm(config: PPOActorConfig): | ||
||
| if config.reward_norm: | ||
| return Normalization(config.reward_norm) | ||
| else: | ||
| return None | ||
|
|
||
|
|
||
| def get_adv_norm(config: PPOActorConfig): | ||
| if config.adv_norm: | ||
| if config.adv_norm.adv_norm_mode == "mix": | ||
| return MAPOAdvNorm(config.adv_norm) | ||
| else: | ||
| return Normalization(config.adv_norm) | ||
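For reference, below is a minimal, self-contained sketch of the mixing rule implemented in `MAPOAdvNorm` above. It operates directly on trajectory-level binary rewards rather than the repo's token-level advantage tensors, and the function name, signature, and `eps` default are illustrative assumptions rather than part of AReal's API:

```python
import torch


def mapo_mixed_advantage(rewards: torch.Tensor, group_size: int, eps: float = 1e-6) -> torch.Tensor:
    """Sketch of the MAPO mixing rule for trajectory-level binary rewards.

    rewards: shape [batch_size * group_size], one scalar outcome reward per trajectory.
    """
    r = rewards.reshape(-1, group_size)            # [batch_size, group_size]
    mean = r.mean(dim=1, keepdim=True)
    std = r.std(dim=1, keepdim=True)

    dev_norm = (r - mean) / (std + eps)            # deviation-based (GRPO) normalization
    mean_norm = (r - mean) / (mean + eps)          # mean-based (APD) normalization

    # p: empirical success probability per group (the higher reward value marks success)
    p = (r == r.max(dim=1, keepdim=True).values).float().mean(dim=1, keepdim=True)
    lam = 1 - 4 * p * (1 - p)                      # trajectory-certainty reweight lambda(p)

    mixed = (1 - lam) * dev_norm + lam * mean_norm
    return mixed.reshape(-1)                       # back to [batch_size * group_size]


# toy usage: one group of 4 rollouts with binary rewards
print(mapo_mixed_advantage(torch.tensor([1.0, 1.0, 0.0, 1.0]), group_size=4))
```

Here `lam` follows the Trajectory Certainty Reweight described in the MAPO README: groups whose outcome is nearly certain (p close to 0 or 1) lean on the mean-based term, while uncertain groups fall back to the standard deviation-based GRPO advantage.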
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,53 @@ | ||
| # Mixed Advantage Policy Optimization (MAPO) | ||
|
|
||
| Last updated: Sep 27, 2025 | ||
|
||
|
|
||
| Author: [Ziyi ZENG](https://github.com/ZiyiTsang) | ||
|
|
||
|  | ||
|
|
||
| The MAPO paper introduces Mixed Advantage Policy Optimization (MAPO), an improved Group Relative Policy Optimization (GRPO) strategy designed to enhance the reasoning performance of foundation models. While GRPO has been effective in post-training foundation models for reasoning tasks, it suffers from "advantage reversion" and "advantage mirror" problems, which lead to an unreasonable allocation of advantage across different query samples. MAPO addresses these limitations by introducing the concept of "trajectory certainty" and proposing an "Advantage Percent Deviation" (APD) for high-certainty trajectories. Furthermore, it dynamically reweights the advantage function based on trajectory certainty through "Trajectory Certainty Reweight" (TCR). This adaptive approach configures the advantage function to account for sample-specific characteristics, mitigating the shortcomings of prior advantage formulations and producing more stable and accurate reasoning performance across diverse tasks. | ||
|
|
||
| The overall surrogate objective is: | ||
|
|
||
|
|
||
| $$\mathcal{J}_{\mathrm{GRPO}}(\theta)=\mathbb{E}_{q\sim\rho_{Q}}\mathbb{E}_{o\sim\pi_{old}(\cdot|q)}\left[\frac{1}{G}\sum_{i}^{G}f_{\epsilon}\left(\frac{\pi_{\theta}(o_{i}|q)}{\pi_{old}(o_{i}|q)},\hat{A}_{i}\right)\right]-\beta\mathbb{D}_{KL}[\pi_{\theta}||\pi_{ref}],$$ | ||
| where: | ||
| $$f_\epsilon(x,y)=\min(xy,\mathrm{clip}(x,1-\epsilon,1+\epsilon)y)$$ | ||
|
|
||
| $$\lambda(p)=1-4p(1-p)\in[0,1]\quad(p\in[0,1])$$ | ||
| where $p$ is the empirical success rate within a group, so $\lambda(p)$ approaches 1 when the group outcome is highly certain ($p$ near 0 or 1). | ||
|
|
||
| $$\hat{A}_i^*=(1-\lambda(p))\cdot\underbrace{\frac{r_i-\mu}{\sigma}}_{\text{Deviation-based}}+\lambda(p)\cdot\underbrace{\frac{r_i-\mu}{\mu}}_{\text{Mean-based}}.$$ | ||
|
|
||
|
|
||
| For more details: | ||
|
|
||
| - AReal Detail: [Paper of AReal](https://arxiv.org/abs/2505.24298) | ||
|
|
||
| - MAPO Detail: [Paper of MAPO](https://arxiv.org/abs/2509.18849v3) | ||
|
|
||
| ## Algorithm Core Parameters | ||
|
|
||
| - `actor.adv_norm.aggregation_mode`: the advantage-normalization implementation. `native` is the z-score normalization used by GRPO, while `mix` enables the MAPO mixed normalization. | ||
||
|
|
||
| ## Notice | ||
| For the MAPO implementation, the following constraints should be met: | ||
|
|
||
| 1. The reward function should return a binary result (any two values): the higher value marks a successful trajectory, the lower value a failed one. | ||
| 2. `overlong_reward_penalty` should be disabled. | ||
|
||
|
|
||
|
|
||
| ## Example Usage | ||
|
|
||
| We recommend changing the parameters in the configuration file | ||
| (i.e. `gsm8k_mapo.yaml`). | ||
|
|
||
| | Backend | CMD | | ||
| | --------- | -------------------------------------------------------------------------------------------------------------------------------- | | ||
| | **local** | `python3 -m areal.launcher.local examples/experimental/mapo/gsm8k_mapo.py --config examples/experimental/mapo/gsm8k_mapo.yaml --<other_args_to_overwrite>` | | ||
| | **ray** | `python3 -m areal.launcher.ray examples/experimental/mapo/gsm8k_mapo.py --config examples/experimental/mapo/gsm8k_mapo.yaml --<other_args_to_overwrite>` | | ||
| | **slurm** | `python3 -m areal.launcher.slurm examples/experimental/mapo/gsm8k_mapo.py --config examples/experimental/mapo/gsm8k_mapo.yaml --<other_args_to_overwrite>` | | ||
|
|
||
| ## Baselines | ||
|
|
||
| We still lack baselines; contributions are welcome! | ||
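As a concrete worked example of the formulas in the README above (numbers chosen purely for illustration, taking $\sigma$ as the population standard deviation): for a group of $G = 4$ rollouts with binary rewards $r = (1, 1, 1, 0)$, the success rate is $p = 3/4$, so

$$\lambda(p) = 1 - 4\cdot\tfrac{3}{4}\cdot\tfrac{1}{4} = \tfrac{1}{4},$$

and the mixed advantage of a successful rollout ($r_i = 1$, $\mu = 0.75$, $\sigma \approx 0.433$) is

$$\hat{A}_i^* = \tfrac{3}{4}\cdot\frac{1 - 0.75}{0.433} + \tfrac{1}{4}\cdot\frac{1 - 0.75}{0.75} \approx 0.75\cdot 0.577 + 0.25\cdot 0.333 \approx 0.52.$$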
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,4 @@ | ||
| experiment_name: gsm8k-grpo | ||
| experiment_name: gsm8k-drgrpo | ||
| trial_name: trial0 | ||
|
|
||
| seed: 1 | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,4 @@ | ||
| experiment_name: gsm8k-grpo | ||
| experiment_name: gsm8k-liteppo | ||
| trial_name: trial0 | ||
|
|
||
| seed: 1 | ||
|
|
||