
Commit a80d33f

support different kl estimator, support reinforce++ and reinforce++-baseline (#408)
1 parent 20a8acc · commit a80d33f

File tree

5 files changed: +380 −4 lines

areal/api/cli_args.py

Lines changed: 8 additions & 2 deletions

@@ -23,7 +23,8 @@ class NormConfig:
     mean_level: str | None = field(
         default="batch",
         metadata={
-            "help": "Mean level for normalization. Choices: batch, group. Omit for no mean normalization."
+            "help": "Mean level for normalization. None for no mean normalization.",
+            "choices": ["batch", "group", None],
         },
     )
     mean_leave1out: bool = field(
@@ -33,7 +34,8 @@ class NormConfig:
     std_level: str | None = field(
         default="batch",
         metadata={
-            "help": "Standard deviation level for normalization. Choices: batch, group. Omit for no std normalization."
+            "help": "Standard deviation level for normalization. None for no std normalization.",
+            "choices": ["batch", "group", None],
         },
     )
     std_unbiased: bool = field(
@@ -374,6 +376,10 @@ class PPOActorConfig(TrainEngineConfig):
 
     # KL Control
     kl_ctl: float = field(default=0.1, metadata={"help": "KL divergence coefficient"})
+    kl_estimator: str = field(
+        default="k1",
+        metadata={"help": "KL divergence estimator", "choices": ["k1", "k2", "k3"]},
+    )
 
     # Asynchronous RL
     recompute_logprob: bool = field(
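
A side note on these config changes: the accepted values now live in a machine-readable "choices" list rather than only in the help string, and passing None explicitly disables the corresponding normalization. A minimal sketch of constructing the dataclass (values illustrative; assumes the remaining NormConfig fields keep their defaults):

from areal.api.cli_args import NormConfig

# Batch-level mean and std normalization, as used by the REINFORCE++ example below.
adv_norm = NormConfig(mean_level="batch", std_level="batch")

# None disables the corresponding normalization step entirely.
no_mean_norm = NormConfig(mean_level=None, std_level="batch")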

areal/engine/ppo/actor.py

Lines changed: 7 additions & 2 deletions

@@ -7,7 +7,11 @@
 from areal.api.engine_api import TrainEngine
 from areal.engine.fsdp_engine import FSDPEngine
 from areal.utils import stats_tracker
-from areal.utils.data import Normalization, split_padded_tensor_dict_into_mb_list
+from areal.utils.data import (
+    KLEstimator,
+    Normalization,
+    split_padded_tensor_dict_into_mb_list,
+)
 from areal.utils.functional import (
     dynamic_sampling,
     gather_logprobs,
@@ -30,6 +34,7 @@ def __init__(self, config: PPOActorConfig, engine: TrainEngine):
         self.group_size = config.group_size
 
         self.kl_ctl = config.kl_ctl
+        self.kl_estimator = KLEstimator(config.kl_estimator)
 
         self.adv_norm = Normalization(config.adv_norm) if config.adv_norm else None
         self.reward_norm = (
@@ -110,7 +115,7 @@ def compute_advantages(self, data: Dict[str, Any]) -> None:
         attn_mask = data["attention_mask"]
         seqlens = attn_mask.sum(-1).long()
         seq_no_eos_mask = seqlens == attn_mask.shape[1]
-        rewards = -self.kl_ctl * (old_logp - ref_logp)
+        rewards = -self.kl_ctl * self.kl_estimator(old_logp, ref_logp)
         kl_rewards = rewards.clone()
         # KL rewards at the next token after eos is zero.
         rewards[batch_indices, seqlens - 1] = 0
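
The behavioral change here is confined to the last hunk: the hard-coded log-ratio old_logp - ref_logp (i.e. the k1 estimator) is replaced by the configured estimator, so k2 or k3 can be selected without touching the advantage logic. A toy sketch of the resulting per-token KL penalty, with hypothetical shapes (batch, seqlen) and illustrative stand-ins for seqlens and batch_indices:

import torch

old_logp = torch.randn(2, 8) - 2.0  # log-probs under the sampling policy
ref_logp = torch.randn(2, 8) - 2.0  # log-probs under the reference policy
kl_ctl = 0.001

# k1 penalty, matching the previous hard-coded behavior:
rewards = -kl_ctl * (old_logp.float() - ref_logp.float())

# Zero the KL reward at the final generated token, mirroring the code above.
seqlens = torch.tensor([5, 8])
batch_indices = torch.arange(2)
rewards[batch_indices, seqlens - 1] = 0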

areal/utils/data.py

Lines changed: 61 additions & 0 deletions

@@ -1301,3 +1301,64 @@ def _compute_std(
     if factor.item() == 0:
         return torch.ones_like(x_sum_sq)
     return (x_sum_sq / factor).sqrt()
+
+
+class KLEstimator:
+    """
+    KL divergence estimator, supports k1, k2 and k3.
+    """
+
+    def __init__(self, kl_estimator: str = "k1", apply_clamp: bool = True):
+        self.kl_estimator = kl_estimator
+        if kl_estimator not in ["k1", "k2", "k3"]:
+            raise ValueError(
+                f"Invalid KL estimator: {kl_estimator}. Valid choices: k1, k2, k3"
+            )
+        self.apply_clamp = apply_clamp
+
+    def __call__(
+        self, log_probs: torch.Tensor, log_probs_base: torch.Tensor
+    ) -> torch.Tensor:
+        return self._compute_approx_kl(
+            log_probs, log_probs_base, self.kl_estimator, self.apply_clamp
+        )
+
+    # adapted from https://github.com/OpenRLHF/OpenRLHF/blob/main/openrlhf/models/utils.py#L7
+    @staticmethod
+    def _compute_approx_kl(
+        log_probs: torch.Tensor,
+        log_probs_base: torch.Tensor,
+        kl_estimator: str = "k1",
+        apply_clamp: bool = True,
+    ) -> torch.Tensor:
+        """
+        Compute the approximate KL divergence between two distributions.
+        Schulman blog: http://joschu.net/blog/kl-approx.html
+
+        Args:
+            log_probs: Log probabilities of the new distribution.
+            log_probs_base: Log probabilities of the base distribution.
+        """
+
+        if kl_estimator == "k1":
+            log_ratio = log_probs.float() - log_probs_base.float()
+
+        # The k2 estimator is the non negative kl approximation in
+        # http://joschu.net/blog/kl-approx.html
+        # The k2_loss is approximately equivalent to the
+        # one-step KL divergence penalty with the k1 estimator
+        # used in https://arxiv.org/pdf/2310.10505.
+        if kl_estimator == "k2":
+            log_ratio = log_probs.float() - log_probs_base.float()
+            log_ratio = log_ratio**2 / 2.0
+
+        # The k3 estimator is the non negative kl approximation in
+        # http://joschu.net/blog/kl-approx.html
+        if kl_estimator == "k3":
+            log_ratio = log_probs.float() - log_probs_base.float()
+            log_ratio = -log_ratio
+            log_ratio = log_ratio.exp() - 1 - log_ratio
+
+        if apply_clamp:
+            log_ratio = log_ratio.clamp(min=-10, max=10)
+        return log_ratio
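
Writing d = log_probs - log_probs_base, the three estimators are k1 = d, k2 = d^2 / 2, and k3 = exp(-d) - 1 + d (see Schulman's blog linked above). Under samples from the current policy, k1 is the unbiased but sign-indefinite estimate of KL(pi || pi_ref); k2 is biased but low-variance; k3 is both unbiased (since E[exp(-d)] = 1) and non-negative. A quick check of the class above on toy tensors:

import torch
from areal.utils.data import KLEstimator

logp = torch.tensor([-1.0, -2.0, -0.5])
logp_ref = torch.tensor([-1.2, -1.8, -0.6])

for name in ("k1", "k2", "k3"):
    print(name, KLEstimator(name)(logp, logp_ref))
# k1 entries can be negative; k2 and k3 are non-negative elementwise.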

examples/math/gsm8k_reinforce.yaml

Lines changed: 150 additions & 0 deletions

@@ -0,0 +1,150 @@
+experiment_name: gsm8k-reinforce-plus-plus
+trial_name: trial0
+
+seed: 1
+total_train_epochs: 10
+tokenizer_path: ${actor.path}
+async_training: true
+
+cluster:
+  n_nodes: 1
+  n_gpus_per_node: 8
+  fileroot: /tmp/areal/experiments
+  name_resolve:
+    type: nfs
+    nfs_record_root: /tmp/areal/name_resolve
+
+allocation_mode: sglang.d4p1t1+d4p1t1
+
+rollout:
+  experiment_name: ${experiment_name}
+  trial_name: ${trial_name}
+  max_concurrent_rollouts: 256
+  queue_size: null
+  consumer_batch_size: ${train_dataset.batch_size}
+  max_head_offpolicyness: 2
+  enable_rollout_tracing: false
+
+gconfig:
+  n_samples: 4
+  min_new_tokens: 0
+  max_new_tokens: 1024
+  greedy: false
+  temperature: 1.0
+
+actor:
+  experiment_name: ${experiment_name}
+  trial_name: ${trial_name}
+  path: Qwen/Qwen2.5-1.5B-Instruct
+  init_from_scratch: false
+  disable_dropout: true
+  gradient_checkpointing: false
+  dtype: bfloat16
+  mb_spec:
+    max_tokens_per_mb: 10240
+  optimizer:
+    type: adam
+    lr: 1.70e-5
+    weight_decay: 0.017
+    beta1: 0.9
+    beta2: 0.999
+    eps: 1e-8
+    lr_scheduler_type: constant
+    gradient_clipping: 1.0
+    warmup_steps_proportion: 0.001
+  backend: fsdp
+  group_size: ${gconfig.n_samples}
+  eps_clip: 0.4
+  temperature: ${gconfig.temperature}
+  reward_scaling: 10.0
+  reward_bias: -0.5
+  kl_ctl: 0.001
+  kl_estimator: k1
+  ppo_n_minibatches: 1
+  recompute_logprob: true
+  use_decoupled_loss: true
+  behav_imp_weight_cap: 5.0
+  dynamic_sampling: false
+  adv_norm:
+    mean_level: batch
+    std_level: batch
+  max_new_tokens: ${gconfig.max_new_tokens}
+
+ref:
+  experiment_name: ${experiment_name}
+  trial_name: ${trial_name}
+  path: ${actor.path}
+  init_from_scratch: false
+  disable_dropout: true
+  dtype: ${actor.dtype}
+  mb_spec:
+    max_tokens_per_mb: 10240
+  optimizer: null
+  backend: fsdp
+
+# SGLang
+sglang:
+  model_path: ${actor.path}
+  random_seed: ${seed}
+  skip_tokenizer_init: true
+  dtype: ${actor.dtype}
+  max_running_requests: null
+  context_length: 32768
+  mem_fraction_static: 0.8
+
+# datasets
+train_dataset:
+  batch_size: 256
+  shuffle: true
+  pin_memory: true
+  num_workers: 4
+  path: openai/gsm8k
+  type: rl
+  max_length: 1024
+
+valid_dataset:
+  batch_size: 256
+  shuffle: true
+  pin_memory: true
+  num_workers: 4
+  path: openai/gsm8k
+  type: rl
+
+# Utilities
+saver:
+  experiment_name: ${experiment_name}
+  trial_name: ${trial_name}
+  fileroot: ${cluster.fileroot}
+  freq_epochs: 1
+  freq_steps: null
+  freq_secs: null
+
+recover:
+  mode: disabled
+  experiment_name: ${experiment_name}
+  trial_name: ${trial_name}
+  fileroot: ${cluster.fileroot}
+  freq_epochs: 1
+  freq_steps: null
+  freq_secs: 3600
+
+evaluator:
+  experiment_name: ${experiment_name}
+  trial_name: ${trial_name}
+  fileroot: ${cluster.fileroot}
+  freq_epochs: 1
+  freq_steps: null
+  freq_secs: null
+
+stats_logger:
+  experiment_name: ${experiment_name}
+  trial_name: ${trial_name}
+  fileroot: ${cluster.fileroot}
+  wandb:
+    mode: disabled
+
+launcher:
+  inference_server_cpus_per_gpu: 4
+  inference_server_mem_per_gpu: 32768
+  trainer_cpus_per_gpu: 4
+  trainer_mem_per_gpu: 32768
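
This config realizes REINFORCE++: a token-level KL penalty folded into the reward (kl_ctl, kl_estimator) plus batch-level advantage normalization (adv_norm with mean_level: batch and std_level: batch). The commit message also mentions REINFORCE++-baseline; by the common convention (e.g. in OpenRLHF) that variant subtracts a per-prompt group mean instead. A hypothetical counterpart, assuming the NormConfig fields shown earlier (the corresponding YAML file is not included in this excerpt):

from areal.api.cli_args import NormConfig

# Assumed REINFORCE++-baseline setting: group-mean baseline, batch-level std.
baseline_adv_norm = NormConfig(mean_level="group", std_level="batch")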
