
Commit 8f48cb1

feature(nyz&dcy): add LLM/VLM RLHF loss (PPO/GRPO/RLOO) (#857)
* test(nyz): polish ppo and add rlhf ppo loss test
* interface(nyz): add naive interface about grpo/rloo
* test&implement(dcy): add unit tests for GRPO and RLOO
  - Add test_grpo_rlhf.py for GRPO unit tests
  - Add test_rloo_rlhf.py for RLOO unit tests
  - Update GRPO implementation
  - Update RLOO implementation
* polish(dcy): polish grpo, rloo and unit tests
* (dcy) rloo and grpo
* (dcy) redesign adv computation from reward
* (dcy) polish style: use selective log-softmax to reduce peak VRAM consumption
* (dcy) small changes
* (dcy) add readme and typing
* (dcy) switch comments to English, rename files and functions
1 parent abcf972 · commit 8f48cb1

File tree: 11 files changed, +943 -14 lines changed


.gitignore

Lines changed: 5 additions & 0 deletions
@@ -1429,3 +1429,8 @@ collect_demo_data_config.py
 events.*

 evogym/*
+ding/example/*
+ding/framework/middleware/tests/wandb/
+ding/.style.yapf
+ding/format.sh
+ding/framework/middleware_v3/

ding/rl_utils/grpo.py

Lines changed: 75 additions & 0 deletions
from typing import Tuple
from collections import namedtuple
import torch
from .log_prob_utils import efficient_method, naive_method, less_efficient_method, LogProbFunction

grpo_policy_data = namedtuple('grpo_policy_data', ['logit_new', 'logit_old', 'logit_ref', 'action', 'adv', 'weight'])
grpo_info = namedtuple('grpo_info', ['approx_kl', 'clipfrac'])


def grpo_policy_error(
        data: namedtuple,
        log_prob_fn: LogProbFunction = efficient_method,  # Method to calculate the log probabilities
        clip_ratio: float = 0.2,
        beta: float = 0.1  # Weight coefficient for KL divergence
) -> Tuple[namedtuple, namedtuple]:
    """
    Overview:
        Group Relative Policy Optimization (arXiv:2402.03300).
    Arguments:
        - data (:obj:`namedtuple`): the grpo input data with fields shown in ``grpo_policy_data``.
        - log_prob_fn (:obj:`LogProbFunction`): the method to calculate the log probabilities, \
            defaults to `efficient_method`.
        - clip_ratio (:obj:`float`): the ppo clip ratio for the constraint of policy update, defaults to 0.2.
        - beta (:obj:`float`): weight coefficient for KL divergence regularization, defaults to 0.1.
    Returns:
        - loss (:obj:`torch.FloatTensor`): the grpo policy loss, a differentiable 0-dim tensor.
        - grpo_info (:obj:`namedtuple`): the grpo optim information for monitoring, all of them are Python scalars.
    Shapes:
        - logit_new (:obj:`torch.FloatTensor`): :math:`(B, S, V)`, where B is batch size, S is sequence length, \
            and V is vocabulary size.
        - logit_old (:obj:`torch.FloatTensor`): :math:`(B, S, V)`.
        - logit_ref (:obj:`torch.FloatTensor`): :math:`(B, S, V)`.
        - action (:obj:`torch.LongTensor`): :math:`(B, S)`.
        - adv (:obj:`torch.FloatTensor`): :math:`(B, )`.
        - weight (:obj:`torch.FloatTensor` or :obj:`None`): :math:`(B, S)`.
        - loss (:obj:`torch.FloatTensor`): :math:`()`, 0-dim tensor.
        - approx_kl (:obj:`float`): approximate KL divergence between the old and new policy.
        - clipfrac (:obj:`float`): proportion of clipped probability ratios.
    """

    # Calculate log probabilities for the selected tokens
    per_token_logps = log_prob_fn(data.logit_new, data.action)
    per_token_ref_logps = log_prob_fn(data.logit_ref, data.action)
    per_token_old_logps = log_prob_fn(data.logit_old, data.action)

    # Calculate KL divergence: exp(q - p) - (q - p) - 1,
    # where p is the current policy and q is the reference policy
    per_token_kl = (torch.exp(per_token_ref_logps - per_token_logps) - (per_token_ref_logps - per_token_logps) - 1)

    # Calculate policy ratio
    ratio = torch.exp(per_token_logps - per_token_old_logps)
    ratio_clipped = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio)

    # Calculate loss for each token
    advantages = data.adv.unsqueeze(1)  # [B, 1]
    per_token_loss_unclipped = ratio * advantages
    per_token_loss_clipped = ratio_clipped * advantages
    per_token_loss = -torch.min(per_token_loss_unclipped, per_token_loss_clipped)

    # Add KL divergence regularization term
    per_token_loss = per_token_loss + beta * per_token_kl

    # Calculate average loss using the weight mask
    weight = data.weight if data.weight is not None else torch.ones_like(per_token_loss)
    loss = ((per_token_loss * weight).sum(dim=1) / weight.sum(dim=1)).mean()

    # Calculate additional metrics
    with torch.no_grad():
        approx_kl = (per_token_old_logps - per_token_logps).mean().item()
        clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio)
        clipfrac = clipped.float().mean().item()

    return loss, grpo_info(approx_kl=approx_kl, clipfrac=clipfrac)
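
For reference, a minimal usage sketch of grpo_policy_error (not part of the commit; the dummy tensors, sizes and variable names below are illustrative only, and the import path simply mirrors the file location above):

import torch
from ding.rl_utils.grpo import grpo_policy_data, grpo_policy_error

B, S, V = 3, 8, 16  # batch size, sequence length, vocabulary size
data = grpo_policy_data(
    logit_new=torch.randn(B, S, V, requires_grad=True),  # current policy logits
    logit_old=torch.randn(B, S, V),                       # behaviour policy logits
    logit_ref=torch.randn(B, S, V),                       # frozen reference policy logits
    action=torch.randint(0, V, (B, S)),                   # sampled token ids
    adv=torch.randn(B),                                   # group-relative advantage per response
    weight=torch.ones(B, S),                              # action mask (1 = valid token)
)
loss, info = grpo_policy_error(data, clip_ratio=0.2, beta=0.1)
loss.backward()  # loss is a differentiable 0-dim tensor
print(info.approx_kl, info.clipfrac)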

ding/rl_utils/log_prob_utils.py

Lines changed: 87 additions & 0 deletions
from typing import List, Callable
import torch
from torch import Tensor

LogitsProcessor = Callable[[Tensor, Tensor], Tensor]


def naive_method(logits: Tensor, index: Tensor) -> Tensor:
    """Calculate per-token log probabilities using the naive method.

    Args:
        logits: Token logits of shape [B, S, V] or [S, V] where:
            B = batch size
            S = sequence length
            V = vocabulary size
        index: Selected token indices of shape [B, S] or [S]

    Returns:
        Tensor: Log probabilities for selected tokens of shape [B, S] or [S]
    """
    # Calculate log probabilities for each token
    log_prob_new: Tensor = torch.log_softmax(logits, dim=-1)
    # Get log probabilities for selected actions
    index = index.unsqueeze(-1)  # [B, S, 1] or [S, 1]
    per_token_logps: Tensor = torch.gather(log_prob_new, -1, index).squeeze(-1)
    return per_token_logps


def efficient_method(logits: Tensor, index: Tensor) -> Tensor:
    """Calculate per-token log probabilities efficiently (selective log-softmax).

    Args:
        logits: Token logits of shape [B, S, V] or [S, V] where:
            B = batch size
            S = sequence length
            V = vocabulary size
        index: Selected token indices of shape [B, S] or [S]

    Returns:
        Tensor: Log probabilities for selected tokens of shape [B, S] or [S]
    """
    if logits.dtype in [torch.float32, torch.float64]:
        selected_logits: Tensor = torch.gather(logits, dim=-1, index=index.unsqueeze(-1)).squeeze(-1)

        # Loop to reduce peak memory consumption
        logsumexp_values: Tensor = torch.stack([torch.logsumexp(lg, dim=-1) for lg in logits])

        # log_softmax(x_i) = x_i - logsumexp(x)
        per_token_logps: Tensor = selected_logits - logsumexp_values
    else:
        # The logsumexp approach is unstable with bfloat16
        per_token_logps: List[Tensor] = []

        # Loop to reduce peak memory consumption
        for row_logits, row_labels in zip(logits, index):  # Iterate over the leading dimension
            row_logps: Tensor = torch.log_softmax(row_logits, dim=-1)
            row_per_token_logps: Tensor = row_logps.gather(dim=-1, index=row_labels.unsqueeze(-1)).squeeze(-1)
            per_token_logps.append(row_per_token_logps)

        per_token_logps = torch.stack(per_token_logps)

    return per_token_logps


def less_efficient_method(logits: Tensor, index: Tensor) -> Tensor:
    """Calculate per-token log probabilities using a categorical distribution.

    Args:
        logits: Token logits of shape [B, S, V] or [S, V] where:
            B = batch size
            S = sequence length
            V = vocabulary size
        index: Selected token indices of shape [B, S] or [S]

    Returns:
        Tensor: Log probabilities for selected tokens of shape [B, S] or [S]
    """
    dist = torch.distributions.categorical.Categorical(logits=logits)
    logp: Tensor = dist.log_prob(index)
    return logp


# Unified type alias for the log-prob functions above
LogProbFunction = Callable[[Tensor, Tensor], Tensor]

# Export all methods
__all__ = ['naive_method', 'efficient_method', 'less_efficient_method', 'LogProbFunction']
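
A quick consistency check for the three helpers (illustrative only, not part of the commit): on float32 inputs they should agree up to numerical tolerance, which is what lets the cheaper selective log-softmax path stand in for the naive one:

import torch
from ding.rl_utils.log_prob_utils import naive_method, efficient_method, less_efficient_method

B, S, V = 2, 5, 11
logits = torch.randn(B, S, V)
index = torch.randint(0, V, (B, S))

logp_naive = naive_method(logits, index)          # full log_softmax, then gather
logp_eff = efficient_method(logits, index)        # gather, then per-row logsumexp
logp_dist = less_efficient_method(logits, index)  # torch.distributions.Categorical

assert logp_naive.shape == (B, S)
assert torch.allclose(logp_naive, logp_eff, atol=1e-5)
assert torch.allclose(logp_naive, logp_dist, atol=1e-5)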

ding/rl_utils/ppo.py

Lines changed: 29 additions & 14 deletions
@@ -104,17 +104,21 @@ def ppo_error(
     return ppo_loss(policy_output.policy_loss, value_loss, policy_output.entropy_loss), policy_info


-def ppo_policy_error(data: namedtuple,
-                     clip_ratio: float = 0.2,
-                     dual_clip: Optional[float] = None) -> Tuple[namedtuple, namedtuple]:
-    '''
+def ppo_policy_error(
+        data: namedtuple,
+        clip_ratio: float = 0.2,
+        dual_clip: Optional[float] = None,
+        entropy_bonus: bool = True
+) -> Tuple[namedtuple, namedtuple]:
+    """
     Overview:
-        Get PPO policy loss
+        Get PPO policy loss (both for classical RL in control/video games and LLM/VLM RLHF).
     Arguments:
-        - data (:obj:`namedtuple`): ppo input data with fields shown in ``ppo_policy_data``
-        - clip_ratio (:obj:`float`): clip value for ratio
-        - dual_clip (:obj:`float`): a parameter c mentioned in arXiv:1912.09729 Equ. 5, should be in [1, inf), \
-            defaults to 5.0, if you don't want to use it, set this parameter to None
+        - data (:obj:`namedtuple`): PPO input data with fields shown in ``ppo_policy_data``.
+        - clip_ratio (:obj:`float`): Clip value for ratio, defaults to 0.2.
+        - dual_clip (:obj:`float`): A parameter c mentioned in arXiv:1912.09729 Equ. 5, should be in [1, inf), \
+            defaults to 5.0, if you don't want to use it, set this parameter to None.
+        - entropy_bonus (:obj:`bool`): Whether to use the entropy bonus, defaults to True. LLM RLHF usually does not use it.
     Returns:
         - ppo_policy_loss (:obj:`namedtuple`): the ppo policy loss item, all of them are the differentiable 0-dim tensor
         - ppo_info (:obj:`namedtuple`): the ppo optim information for monitoring, all of them are Python scalar
@@ -136,18 +140,29 @@ def ppo_policy_error(data: namedtuple,
         >>> weight=torch.ones(3),
         >>> )
         >>> loss, info = ppo_policy_error(data)
-    '''
+
+    .. note::
+        This function can be extended from `B` to more parallel dimensions, like `(B, S)`, where `S` is the
+        sequence length in LLM/VLM.
+
+    .. note::
+        For the action mask often used in LLM/VLM, users can set the `weight` to the action mask.
+    """
     logit_new, logit_old, action, adv, weight = data
     if weight is None:
         weight = torch.ones_like(adv)
     dist_new = torch.distributions.categorical.Categorical(logits=logit_new)
     dist_old = torch.distributions.categorical.Categorical(logits=logit_old)
     logp_new = dist_new.log_prob(action)
     logp_old = dist_old.log_prob(action)
-    dist_new_entropy = dist_new.entropy()
-    if dist_new_entropy.shape != weight.shape:
-        dist_new_entropy = dist_new.entropy().mean(dim=1)
-    entropy_loss = (dist_new_entropy * weight).mean()
+
+    if entropy_bonus:
+        dist_new_entropy = dist_new.entropy()
+        if dist_new_entropy.shape != weight.shape:  # for the multi-agent RL case
+            dist_new_entropy = dist_new.entropy().mean(dim=1)
+        entropy_loss = (dist_new_entropy * weight).mean()
+    else:
+        entropy_loss = torch.tensor(0.0)
     # policy_loss
     ratio = torch.exp(logp_new - logp_old)
     if ratio.shape != adv.shape:
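
To illustrate the two notes added above, here is a hedged sketch (not part of the commit) of the LLM/VLM RLHF style of call: token-level (B, S) inputs, entropy bonus disabled, and weight used as the action mask; field names follow the ``ppo_policy_data`` unpacking shown in the diff, while the sizes and mask below are illustrative only:

import torch
from ding.rl_utils.ppo import ppo_policy_data, ppo_policy_error

B, S, V = 4, 16, 128  # batch size, sequence length, vocabulary size
data = ppo_policy_data(
    logit_new=torch.randn(B, S, V, requires_grad=True),
    logit_old=torch.randn(B, S, V),
    action=torch.randint(0, V, (B, S)),
    adv=torch.randn(B, S),                    # token-level advantages
    weight=(torch.rand(B, S) > 0.1).float(),  # action mask over prompt/padding tokens
)
loss, info = ppo_policy_error(data, clip_ratio=0.2, entropy_bonus=False)
loss.policy_loss.backward()  # with entropy_bonus=False the entropy term is a constant zero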

ding/rl_utils/rloo.py

Lines changed: 69 additions & 0 deletions
from typing import Tuple
from collections import namedtuple
import torch
from .log_prob_utils import efficient_method, naive_method, less_efficient_method, LogProbFunction

rloo_policy_data = namedtuple('rloo_policy_data', ['logit_new', 'logit_old', 'action', 'reward', 'weight'])
rloo_info = namedtuple('rloo_info', ['approx_kl', 'clipfrac'])


def rloo_policy_error(
        data: namedtuple,
        log_prob_fn: LogProbFunction = efficient_method,  # Method to calculate the log probabilities
        clip_ratio: float = 0.2,
) -> Tuple[namedtuple, namedtuple]:
    """
    Overview:
        REINFORCE Leave-One-Out (arXiv:2402.14740).
    Arguments:
        - data (:obj:`namedtuple`): the rloo input data with fields shown in ``rloo_policy_data``.
        - log_prob_fn (:obj:`LogProbFunction`): the method to calculate the log probabilities, \
            defaults to `efficient_method`.
        - clip_ratio (:obj:`float`): the ppo clip ratio for the constraint of policy update, defaults to 0.2.
    Returns:
        - loss (:obj:`torch.FloatTensor`): the rloo policy loss, a differentiable 0-dim tensor.
        - rloo_info (:obj:`namedtuple`): the rloo optim information for monitoring, all of them are Python scalars.
    Shapes:
        - logit_new (:obj:`torch.FloatTensor`): :math:`(B, S, V)`, where B is batch size, S is sequence length, \
            and V is vocabulary size.
        - logit_old (:obj:`torch.FloatTensor`): :math:`(B, S, V)`.
        - action (:obj:`torch.LongTensor`): :math:`(B, S)`.
        - reward (:obj:`torch.FloatTensor`): :math:`(K, B)`, where K is the number of samples per prompt.
        - weight (:obj:`torch.FloatTensor` or :obj:`None`): :math:`(B, S)`.
        - loss (:obj:`torch.FloatTensor`): :math:`()`, 0-dim tensor.
        - approx_kl (:obj:`float`): approximate KL divergence between the old and new policy.
        - clipfrac (:obj:`float`): proportion of clipped probability ratios.
    """

    # Calculate the leave-one-out advantage of each sampled response
    rloo_k = data.reward.size(0)
    baseline = (data.reward.sum(0) - data.reward) / (rloo_k - 1)
    adv = data.reward - baseline
    adv = adv.flatten()

    # Get log probabilities for selected actions
    per_token_logps = log_prob_fn(data.logit_new, data.action)
    per_token_old_logps = log_prob_fn(data.logit_old, data.action)

    # Calculate policy ratio
    ratio = torch.exp(per_token_logps - per_token_old_logps)
    ratio_clipped = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio)

    # Calculate loss for each token
    advantages = adv.unsqueeze(1)  # [B, 1]
    per_token_loss_unclipped = ratio * advantages
    per_token_loss_clipped = ratio_clipped * advantages
    per_token_loss = -torch.min(per_token_loss_unclipped, per_token_loss_clipped)

    # Calculate average loss using the weight mask
    weight = data.weight if data.weight is not None else torch.ones_like(per_token_loss)
    loss = ((per_token_loss * weight).sum(dim=1) / weight.sum(dim=1)).mean()

    # Calculate additional metrics
    with torch.no_grad():
        approx_kl = (per_token_old_logps - per_token_logps).mean().item()
        clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio)
        clipfrac = clipped.float().mean().item()

    return loss, rloo_info(approx_kl=approx_kl, clipfrac=clipfrac)
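
A minimal usage sketch (illustrative only, not part of the commit): with K sampled responses per prompt, reward is (K, B_prompt) and the batch dimension of the logits holds the K * B_prompt flattened responses, matching how adv is flattened inside rloo_policy_error; the exact ordering of responses along the batch dimension is an assumption made for this sketch:

import torch
from ding.rl_utils.rloo import rloo_policy_data, rloo_policy_error

K, B_prompt, S, V = 4, 2, 8, 16  # samples per prompt, prompts, sequence length, vocab size
data = rloo_policy_data(
    logit_new=torch.randn(K * B_prompt, S, V, requires_grad=True),
    logit_old=torch.randn(K * B_prompt, S, V),
    action=torch.randint(0, V, (K * B_prompt, S)),
    reward=torch.randn(K, B_prompt),     # one scalar reward per sampled response
    weight=torch.ones(K * B_prompt, S),  # action mask
)
loss, info = rloo_policy_error(data, clip_ratio=0.2)
loss.backward()
print(info.approx_kl, info.clipfrac)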
