15 changes: 15 additions & 0 deletions areal/api/cli_args.py
@@ -51,6 +51,18 @@ class NormConfig:
group_size: int = field(
default=1, metadata={"help": "Group size for group-level normalization"}
)
adv_norm_mode: str = field(
default="native",
metadata={
"help": "native or mix. 'native' is the normal z-score normalization. For 'mix', both normal z-score and mean-based z-score normalization will be calculated and aggregated (see MAPO paper for more details)."
},
)
reward_norm_mode: str = field(
default="native",
metadata={
"help": "Mode for reward normalization. Currently only 'native' is supported."
},
)


@dataclass
@@ -617,6 +629,8 @@ def build_cmd(
# convert to flags
flags = []
for k, v in args.items():
if "max_loaded_loras" in k:
continue
if v is None or v is False or v == "":
continue
if v is True:
@@ -625,6 +639,7 @@
flags.append(f"--{k.replace('_','-')} {' '.join(map(str, v))}")
else:
flags.append(f"--{k.replace('_','-')} {v}")

return f"python3 -m sglang.launch_server {' '.join(flags)}"

@staticmethod
12 changes: 7 additions & 5 deletions areal/engine/ppo/actor.py
@@ -7,7 +7,11 @@
from areal.api.engine_api import TrainEngine
from areal.engine.fsdp_engine import FSDPEngine
from areal.utils import stats_tracker
from areal.utils.data import Normalization, split_padded_tensor_dict_into_mb_list
from areal.utils.data import (
get_adv_norm,
get_reward_norm,
split_padded_tensor_dict_into_mb_list,
)
from areal.utils.functional import (
dynamic_sampling,
gather_logprobs,
@@ -31,10 +35,8 @@ def __init__(self, config: PPOActorConfig, engine: TrainEngine):

self.kl_ctl = config.kl_ctl

self.adv_norm = Normalization(config.adv_norm) if config.adv_norm else None
self.reward_norm = (
Normalization(config.reward_norm) if config.reward_norm else None
)
self.adv_norm = get_adv_norm(config)
self.reward_norm = get_reward_norm(config)

self.discount = config.discount
self.gae_lambda = config.gae_lambda
128 changes: 126 additions & 2 deletions areal/utils/data.py
@@ -11,7 +11,7 @@
from einops import rearrange
from torchdata.stateful_dataloader import StatefulDataLoader

from areal.api.cli_args import MicroBatchSpec, NormConfig
from areal.api.cli_args import MicroBatchSpec, NormConfig, PPOActorConfig
from areal.platforms import current_platform
from areal.utils import datapack, logging

@@ -1070,6 +1070,7 @@ def cycle_dataloader(dataloader: StatefulDataLoader):
g = iter(dataloader)


# Base 'native' normalization implementation (used for both reward and advantage normalization)
class Normalization:
"""
Adaptive normalization with different levels.
@@ -1108,7 +1109,11 @@ def __call__(
loss_mask: Optional[torch.Tensor] = None,
high_precision: bool = True,
reduce_group=None,
calculation_base: str = "deviation",
) -> torch.Tensor:

# x holds advantages or rewards, with shape [bs * self.group_size, max_tokens]

bs = x.size(0)
eps = self.eps

@@ -1200,8 +1205,15 @@ def __call__(
std = torch.ones_like(x)
eps = 0.0

assert calculation_base in [
"mean",
"deviation",
], "calculation_base must be either mean or deviation"
base = std if calculation_base == "deviation" else mean
# Ensure stability
base = base + eps
# Normalize
return (x_centered / (std + eps)).float()
return (x_centered / base).float()
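
For intuition, a minimal sketch of what the two values of calculation_base compute on a toy group of binary rewards (plain tensor arithmetic only; the variable names are illustrative and not part of the AReaL API):

    import torch

    rewards = torch.tensor([1.0, 1.0, 0.0, 1.0])  # one group of binary rewards
    mu, sigma, eps = rewards.mean(), rewards.std(), 1e-5

    # calculation_base="deviation": the usual z-score
    deviation_based = (rewards - mu) / (sigma + eps)
    # calculation_base="mean": percent deviation from the group mean
    mean_based = (rewards - mu) / (mu + eps)
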

@staticmethod
def _compute_mean(
@@ -1301,3 +1313,115 @@ def _compute_std(
if factor.item() == 0:
return torch.ones_like(x_sum_sq)
return (x_sum_sq / factor).sqrt()


# Mixed advantage normalization from the MAPO paper, derived from the base 'native' Normalization implementation
class MAPOAdvNorm(Normalization):
def __call__(self, advantages, loss_mask=None, **kwargs) -> torch.Tensor:
# Count the unique non-zero values in the advantages tensor (0 corresponds to padded tokens)

# deviation_base_norm shape [batch_size*group_size, max_token]
deviation_base_norm = super().__call__(
advantages, loss_mask=loss_mask, calculation_base="deviation", **kwargs
)

unique_elements = torch.unique(advantages[advantages != 0])

if unique_elements.numel() >= 3 or unique_elements.numel() <= 1:
if unique_elements.numel() >= 3:
logger.warning(
(
f"MAPO only supports binary reward modeling, but detected {unique_elements.numel()} unique values in the advantages tensor. Please check: "
f"1. that the reward function returns a binary value; "
f"2. that overlong_reward_penalty is disabled."
)
)
# all advantages are identical (but not all zero)
else:
logger.info(
"All advantages in the batch are identical; please check your reward function."
)

# falling back to the native implementation is safe here
logger.info("Falling back to native advantage normalization.")
return super().__call__(
advantages, loss_mask=loss_mask, calculation_base="deviation", **kwargs
)

# 'unique_upper_value' is the reward value of a successful trajectory; 'unique_lower_value' that of a failed one
unique_upper_value, unique_lower_value = max(unique_elements), min(
unique_elements
)
Contributor (review comment, critical):

Calling max(unique_elements) will raise a runtime error if unique_elements is an empty tensor, which can happen if all advantages are zero. You should add a check to handle this edge case before calling max() and min().

        if unique_elements.numel() == 0:
            # All non-padded advantages are 0, return as is.
            return advantages.float()
        unique_upper_value, unique_lower_value = max(unique_elements), min(
            unique_elements
        )

unique_elements = unique_elements.numel()

assert unique_elements <= 2, (
f"MAPO only supports binary reward modeling, but detected {unique_elements} unique values in the advantages tensor. Please check: "
f"1. that the reward function returns a binary value; "
f"2. that overlong_reward_penalty is disabled."
)

# mean_base_norm shape [batch_size*group_size, max_token]
mean_base_norm = super().__call__(
advantages, loss_mask=loss_mask, calculation_base="mean", **kwargs
)

bs, max_token = int(advantages.shape[0] / self.group_size), advantages.shape[-1]

# since the advantage is constant within a trajectory (as in GRPO), the trajectory-level
# advantage can be read from the first token; this relies on the assumption that the
# advantage is identical across the token dimension

advantages_ = advantages[:, 0]  # advantages_ shape [batch_size*group_size]
Contributor (review comment, critical):

This line assumes that the advantages tensor is constant across the time dimension for each trajectory (advantages[:, 0]). This assumption is incorrect for GAE-calculated advantages, which are per-token. The MAPO paper normalizes trajectory-level returns, not per-token advantages. This implementation appears to be applying the normalization logic to the wrong tensor, which is a fundamental flaw in the algorithm's implementation.

Collaborator Author:

We do assume that the advantages tensor is constant across the time dimension for each trajectory. This does not hold for PPO, but it makes sense for GRPO.

Collaborator:

This line does not take any effect and should be removed.

Collaborator Author:

No, it is useful. Please see the code comment.

Collaborator Author:

The advantage of the first token is extracted and used in the logic below.


advantages_ = advantages_.reshape(
bs, self.group_size
)  # advantages_ shape [batch_size, group_size]

# number of successful trajectories within each group
success_trajectory_nums_per_group = (advantages_ == unique_upper_value).sum(
dim=1
) # success_trajectory_nums shape [batch_size]
# total number of trajectories within each group
total_trajectory_nums_per_group = torch.tensor([self.group_size] * bs).to(
device=success_trajectory_nums_per_group.device,
dtype=success_trajectory_nums_per_group.dtype,
) # total_trajectory_nums shape [batch_size]
# fraction of successful trajectories within each group
trajectory_certainty_degree = (
success_trajectory_nums_per_group / total_trajectory_nums_per_group
)

# trajectory_reweight shape [batch_size]; the reweighting factor applied to each group of trajectories
trajectory_reweight = (
4 * trajectory_certainty_degree * (1 - trajectory_certainty_degree)
)
# expand trajectory_reweight to cover every token of the advantages:
# [batch_size] -> [batch_size*group_size] -> [batch_size*group_size, max_token];
# every token of a trajectory shares the same reweight (group-level -> trajectory-level -> token-level)
trajectory_reweight = (
trajectory_reweight.repeat_interleave(self.group_size)
.unsqueeze(-1)
.expand(-1, max_token)
)
# 'trajectory_reweight', 'deviation_base_norm', and 'mean_base_norm' now share the same granularity,
# so the element-wise combination below is well-defined
return (
1 - trajectory_reweight
) * deviation_base_norm + trajectory_reweight * mean_base_norm
Comment on lines +1469 to +1471

Collaborator:

Double-check the formula. Since trajectory_reweight is computed as 4p(1-p) rather than 1-4p(1-p), should we reverse the weighting of these two norms?

Collaborator Author:

done

Collaborator Author:

my mistake. thank you



def get_reward_norm(config: PPOActorConfig):
if config.reward_norm:
return Normalization(config.reward_norm)
else:
return None


def get_adv_norm(config: PPOActorConfig):
if config.adv_norm:
if config.adv_norm.adv_norm_mode == "mix":
return MAPOAdvNorm(config.adv_norm)
else:
return Normalization(config.adv_norm)
return None
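
For reference, a rough sketch of how the factory above is wired from the actor side. The `adv_norm` attribute on `PPOActorConfig` is the `NormConfig` used in `areal/engine/ppo/actor.py`; the concrete field values below are illustrative assumptions, not recommended defaults.

    from areal.api.cli_args import NormConfig

    # Hypothetical NormConfig; only adv_norm_mode matters for the dispatch.
    cfg = NormConfig(mean_level="group", std_level="group", group_size=8, adv_norm_mode="mix")

    # get_adv_norm(actor_cfg) effectively reduces to this dispatch on actor_cfg.adv_norm:
    adv_norm = MAPOAdvNorm(cfg) if cfg.adv_norm_mode == "mix" else Normalization(cfg)
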
13 changes: 12 additions & 1 deletion areal/utils/functional.py
@@ -96,6 +96,7 @@ def masked_normalization(
high_precision=True,
all_reduce=True,
reduce_group=None,
calculation_base: str = "deviation",
):
dtype = torch.float64 if high_precision else torch.float32
x = x.to(dtype)
Expand Down Expand Up @@ -124,7 +125,17 @@ def masked_normalization(
var = meansq - mean**2
if unbiased:
var *= factor / (factor - 1)
return ((x - mean) / (var.sqrt() + eps)).float()
assert calculation_base in [
"mean",
"deviation",
], "calculation_base must be either mean or deviation"

std = var.sqrt()
base = std if calculation_base == "deviation" else mean
# Ensure stability
base = base + eps
# Normalize
return ((x - mean) / base).float()


def ppo_actor_loss_fn(
1 change: 1 addition & 0 deletions docs/_toc.yml
@@ -41,6 +41,7 @@ parts:
- file: algorithms/dapo
- file: algorithms/dr.GRPO
- file: algorithms/litePPO
- file: algorithms/mapo
- caption: Customization (Legacy)
chapters:
- file: legacy/customization/dataset
53 changes: 53 additions & 0 deletions docs/algorithms/mapo.md
@@ -0,0 +1,53 @@
# Mixed Advantage Policy Optimization (MAPO)

Last updated: Sep 27, 2025

Author: [Ziyi ZENG](https://github.com/ZiyiTsang)

![MAPO overview](../figures/MAPO.jpg)

Mixed Advantage Policy Optimization (MAPO) is an improved Group Relative Policy Optimization (GRPO) strategy designed to enhance the reasoning performance of foundation models. While GRPO has been effective for post-training foundation models on reasoning tasks, it suffers from "advantage reversion" and "advantage mirror" problems, which lead to an unreasonable allocation of advantage across different query samples. MAPO addresses these limitations by introducing the concept of "trajectory certainty" and proposing an "Advantage Percent Deviation" (APD) for high-certainty trajectories. Furthermore, it dynamically reweights the advantage function based on trajectory certainty through "Trajectory Certainty Reweight" (TCR). This adaptive approach configures the advantage function to account for sample-specific characteristics, thereby mitigating the shortcomings of prior advantage formulations and producing more stable and accurate reasoning performance across diverse tasks.

The overall surrogate objective is:


$$\mathcal{J}_{\mathrm{GRPO}}(\theta)=\mathbb{E}_{q\sim\rho_{Q}}\mathbb{E}_{o\sim\pi_{old}(\cdot|q)}\left[\frac{1}{G}\sum_{i=1}^{G}f_{\epsilon}\left(\frac{\pi_{\theta}(o_{i}|q)}{\pi_{old}(o_{i}|q)},\hat{A}_{i}^{*}\right)\right]-\beta\mathbb{D}_{KL}[\pi_{\theta}||\pi_{ref}],$$
where:
$$f_\epsilon(x,y)=\min(xy,\mathrm{clip}(x,1-\epsilon,1+\epsilon)y)$$

$$\lambda(p)=1-4p(1-p)\in[0,1]\quad(p\in[0,1])$$

$$\hat{A}_i^*=(1-\lambda(p))*\underbrace{\frac{r_i-\mu}{\sigma}}_{\text{Deviation-based}}+\lambda(p)*\underbrace{\frac{r_i-\mu}{\mu}}_{\text{Mean-based}}.$$
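
As a reading aid, the mixed advantage above can be sketched in a few lines of PyTorch. This is a toy illustration of the formula (not the AReaL implementation); `rewards` holds one group of binary rewards, `p` is the fraction of successful trajectories in that group, and the `eps` term is added here only for numerical stability.

```python
import torch

def mixed_advantage(rewards: torch.Tensor, p: float, eps: float = 1e-5) -> torch.Tensor:
    mu, sigma = rewards.mean(), rewards.std()
    lam = 1 - 4 * p * (1 - p)                     # trajectory-certainty weight lambda(p)
    deviation_based = (rewards - mu) / (sigma + eps)
    mean_based = (rewards - mu) / (mu + eps)
    return (1 - lam) * deviation_based + lam * mean_based
```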


For more details:

- AReaL details: [AReaL paper](https://arxiv.org/abs/2505.24298)

- MAPO details: [MAPO paper](https://arxiv.org/abs/2509.18849v3)

## Algorithm Core Parameters

- `actor.adv_norm.adv_norm_mode`: selects the advantage-normalization implementation. 'native' is the z-score normalization used by GRPO, while 'mix' enables the MAPO formulation.

## Notice
For the MAPO implementation, the following constraints must be met:

1. The reward function should return a binary result (any two distinct values): the higher value marks a successful trajectory and the lower value a failed one (see the sketch below).
2. `overlong_reward_penalty` should be disabled.
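
A minimal sketch of a compliant reward function (the signature and helper below are illustrative placeholders, not the AReaL reward interface; adapt them to your workflow):

```python
def extract_final_answer(completion: str) -> str:
    # Placeholder parsing logic; replace with your own answer extraction.
    return completion.strip().split()[-1] if completion.strip() else ""

def binary_reward(prompt: str, completion: str, answer: str) -> float:
    # Returns exactly two values: 1.0 for a successful trajectory, 0.0 for a failed one.
    return 1.0 if extract_final_answer(completion) == answer else 0.0
```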


## Example Usage

We recommend changing these parameters in the configuration file
(i.e., `gsm8k_mapo.yaml`).

| Backend | CMD |
| --------- | -------------------------------------------------------------------------------------------------------------------------------- |
| **local** | `python3 -m areal.launcher.local examples/experimental/mapo/gsm8k_mapo.py --config examples/experimental/mapo/gsm8k_mapo.yaml --<other_args_to_overwrite>` |
| **ray** | `python3 -m areal.launcher.ray examples/experimental/mapo/gsm8k_mapo.py --config examples/experimental/mapo/gsm8k_mapo.yaml --<other_args_to_overwrite>` |
| **slurm** | `python3 -m areal.launcher.slurm examples/experimental/mapo/gsm8k_mapo.py --config examples/experimental/mapo/gsm8k_mapo.yaml --<other_args_to_overwrite>` |

## Baselines

We still lack baselines; contributions are welcome!
23 changes: 15 additions & 8 deletions docs/cli_reference.md
@@ -274,14 +274,16 @@ Specification for splitting micro-batches during training.

Configuration for reward/advantage normalization.

| Parameter | Type | Default | Description |
| ---------------- | -------------- | --------- | ------------------------------------------------------------------------------------------------- |
| `mean_level` | string \| None | `"batch"` | Mean level for normalization. Choices: batch, group. Omit for no mean normalization. |
| `mean_leave1out` | boolean | `False` | Whether to use leave-one-out average. |
| `std_level` | string \| None | `"batch"` | Standard deviation level for normalization. Choices: batch, group. Omit for no std normalization. |
| `std_unbiased` | boolean | `False` | Whether to use unbiased standard deviation computation. |
| `eps` | float | `1e-05` | The eps when dividing by standard deviation to avoid numerical issues. |
| `group_size` | integer | `1` | Group size for group-level normalization |
| Parameter | Type | Default | Description |
| ------------------ | -------------- | ---------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `mean_level` | string \| None | `"batch"` | Mean level for normalization. Choices: batch, group. Omit for no mean normalization. |
| `mean_leave1out` | boolean | `False` | Whether to use leave-one-out average. |
| `std_level` | string \| None | `"batch"` | Standard deviation level for normalization. Choices: batch, group. Omit for no std normalization. |
| `std_unbiased` | boolean | `True` | Whether to use unbiased standard deviation computation. Defaults to True (changed from False in v0.3.4). |
| `eps` | float | `1e-05` | The eps when dividing by standard deviation to avoid numerical issues. |
| `group_size` | integer | `1` | Group size for group-level normalization |
| `adv_norm_mode`    | string         | `"native"` | 'native' or 'mix'. 'native' applies standard z-score normalization; 'mix' computes both the standard z-score and the mean-based normalization and aggregates them (see the MAPO paper for details). |
| `reward_norm_mode` | string | `"native"` | Mode for reward normalization. Currently only 'native' is supported. |

(section-optimizer)=

@@ -384,6 +386,11 @@ Configuration for PPO critic model, a subclass of a TrainEngine.
| `optimizer` | [`OptimizerConfig`](section-optimizer) \| None | `None` | Optimizer configuration. None means no training. |
| `backend` | string | `""` | Training backend (refer to documentation) |
| `fsdp` | [`FSDPEngineConfig`](section-fsdp-engine) | **Required** | - |
| `use_lora` | boolean | `False` | Whether to use LoRA. Only support FSDP. Note that should be enabled together with vLLM/SGLang. |
| `lora_rank` | integer | `32` | lora rank |
| `lora_alpha` | integer | `16` | lora alpha |
| `target_modules` | list of string | **Required** | lora target_modules. None defaults to 'all-linear' |
| `peft_type` | string | `"lora"` | peft method type. Only LoRA is supported for now. |
| `ppo_n_minibatches` | integer | `4` | Number of minibatches for each PPO update |
| `eps_clip` | float | `0.5` | Clipping factor for value loss |
| `mask_no_eos_with_zero` | boolean | `False` | Mask truncated generations (no EOS token) and exclude from training |
Binary file added docs/figures/MAPO.jpg
2 changes: 1 addition & 1 deletion examples/experimental/dr.grpo/gsm8k_drgrpo.yaml
@@ -1,4 +1,4 @@
experiment_name: gsm8k-grpo
experiment_name: gsm8k-drgrpo
trial_name: trial0

seed: 1