@@ -72,11 +72,10 @@ class DPPOTrainer(GRPOTrainer):
7272 """
7373 Trainer for Divergence Proximal Policy Optimization (DPPO).
7474
75- DPPO replaces PPO/GRPO's heuristic ratio-clipping with a principled trust region based on direct policy
76- divergence estimates. PPO-style clipping masks tokens based on probability ratio π/μ, which over-penalizes
77- low-probability tokens and under-penalizes high-probability tokens. In contrast, DPPO masks based on
78- direct approximation of policy divergence (e.g., TV or KL), ensuring updates stay within a theoretically
79- grounded trust region.
75+ DPPO replaces PPO/GRPO's heuristic ratio-clipping with a principled trust region based on direct policy divergence
76+ estimates. PPO-style clipping masks tokens based on probability ratio π/μ, which over-penalizes low-probability
77+ tokens and under-penalizes high-probability tokens. In contrast, DPPO masks based on direct approximation of policy
78+ divergence (e.g., TV or KL), ensuring updates stay within a theoretically grounded trust region.
8079
8180
8281 Four divergence approximations are supported:
@@ -275,8 +274,8 @@ def _generate_single_turn(self, prompt_ids, images, multimodal_fields):
275274 """Generate completions, always extracting sampled token logprobs.
276275
277276 Returns:
278- 5-tuple of (prompt_ids, completion_ids, logprobs, topk_logprobs, topk_token_ids).
279- topk_logprobs and topk_token_ids are None when divergence_type is not topk.
277+ 5-tuple of (prompt_ids, completion_ids, logprobs, topk_logprobs, topk_token_ids). topk_logprobs and
278+ topk_token_ids are None when divergence_type is not topk.
280279 """
281280 device = self .accelerator .device
282281 mode = "train" if self .model .training else "eval"
@@ -420,9 +419,9 @@ def _tool_call_loop(
420419 ):
421420 """Tool execution loop that also threads top-K logprob data alongside logprobs.
422421
423- Mirrors GRPOTrainer._tool_call_loop but additionally concatenates topk_logprobs and topk_token_ids
424- the same way logprobs is concatenated: real data for model-generated tokens, zero-padding for
425- tool-result tokens. When topk data is None (binary divergence), behaves identically to the parent.
422+ Mirrors GRPOTrainer._tool_call_loop but additionally concatenates topk_logprobs and topk_token_ids the same way
423+ logprobs is concatenated: real data for model-generated tokens, zero-padding for tool-result tokens. When topk
424+ data is None (binary divergence), behaves identically to the parent.
426425 """
427426 K = self .divergence_topk
428427 has_topk = topk_logprobs is not None
@@ -620,8 +619,8 @@ def _generate(self, prompts: list):
620619 """Generate completions, handling tool calls, and thread top-K logprob data through the full pipeline.
621620
622621 Returns:
623- 9-tuple of (prompt_ids, completion_ids, tool_mask, completions, total_completion_tokens,
624- logprobs, topk_logprobs, topk_token_ids, extra_fields).
622+ 9-tuple of (prompt_ids, completion_ids, tool_mask, completions, total_completion_tokens, logprobs,
623+ topk_logprobs, topk_token_ids, extra_fields).
625624 """
626625 device = self .accelerator .device
627626 mode = "train" if self .model .training else "eval"
@@ -768,8 +767,8 @@ def _get_per_token_logps_with_topk(
768767 ) -> tuple [torch .Tensor , torch .Tensor | None , torch .Tensor ]:
769768 """Compute per-token log-probs, (optionally) entropies, and top-K log-probs in one forward pass.
770769
771- Evaluates the current policy's log-probs at the rollout's top-K token IDs from the same
772- forward pass used for per_token_logps, avoiding an extra model call.
770+ Evaluates the current policy's log-probs at the rollout's top-K token IDs from the same forward pass used for
771+ per_token_logps, avoiding an extra model call.
773772
774773 Args:
775774 topk_token_ids: Rollout policy's top-K token IDs, shape (B, T, K). The current policy's
@@ -1207,11 +1206,11 @@ def _compute_divergence_mask(
12071206 completion_mask (`torch.Tensor`):
12081207 Binary mask of shape `(B, T)` where `1` indicates valid completion tokens and `0` padding.
12091208 current_topk_logps (`torch.Tensor` or `None`):
1210- Log-probabilities of the current policy at the rollout's top-K token IDs, shape `(B, T, K)`.
1211- Required when `divergence_type` is `"topk_tv"` or `"topk_kl"`.
1209+ Log-probabilities of the current policy at the rollout's top-K token IDs, shape `(B, T, K)`. Required
1210+ when `divergence_type` is `"topk_tv"` or `"topk_kl"`.
12121211 sampling_topk_logps (`torch.Tensor` or `None`):
1213- Log-probabilities of the sampling policy at the rollout's top-K token IDs, shape `(B, T, K)`.
1214- Required when `divergence_type` is `"topk_tv"` or `"topk_kl"`.
1212+ Log-probabilities of the sampling policy at the rollout's top-K token IDs, shape `(B, T, K)`. Required
1213+ when `divergence_type` is `"topk_tv"` or `"topk_kl"`.
12151214
12161215 Returns:
12171216 `torch.Tensor`:
0 commit comments