huggingface · LeonEricsson · Feb 13, 2026 · Feb 13, 2026 · Feb 15, 2026 · Feb 17, 2026
diff --git a/docs/source/paper_index.md b/docs/source/paper_index.md
@@ -592,6 +592,37 @@ training_args = GRPOConfig(
 )
 ```
 
+
+### Rethinking the Trust Region in LLM Reinforcement Learning
+
+**📜 Paper**: https://huggingface.co/papers/2602.04879
+
+DPPO replaces PPO/GRPO's heuristic ratio-clipping with a principled trust region based on direct policy divergence estimates. PPO-style clipping masks tokens based on the probability ratio π/μ, which over-penalizes low-probability tokens and under-penalizes high-probability ones. DPPO instead masks based on direct approximations of policy divergence (TV or KL), ensuring updates stay within a theoretically grounded trust region. Four divergence approximations are supported: `binary_tv`, `binary_kl`, `topk_tv`, and `topk_kl`.
+
+```python
+from trl.experimental.dppo import DPPOConfig, DPPOTrainer
+
+training_args = DPPOConfig(
+    divergence_type="binary_kl",  # divergence approximation (Section 3.2 of the paper)
+    divergence_topk=20,  # K for top-K divergence modes (Section 3.2 of the paper)
+    epsilon=0.2,  # δ_low threshold (Section 3.2 of the paper)
+    epsilon_high=0.28,  # δ_high threshold (Section 3.2 of the paper)
+    clip_ratio_c=3.0,  # IS ratio upper bound C (Section 3.2 of the paper)
+    beta=0.0,  # KL regularization coefficient
+    use_vllm=True,
+)
+
+trainer = DPPOTrainer(
+    model="your-model",
+    reward_funcs=[...],
+    args=training_args,
+    train_dataset=dataset,
+)
+trainer.train()
+```
+
+The official code is available at [sail-sg/Stable-RL](https://github.com/sail-sg/Stable-RL).
+
 ## Direct Policy Optimization
 
 Papers relating to the [`DPOTrainer`]

diff --git a/tests/experimental/test_dppo_trainer.py b/tests/experimental/test_dppo_trainer.py
@@ -0,0 +1,188 @@
+# Copyright 2020-2026 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+import torch
+from datasets import load_dataset
+
+from trl.experimental.dppo import DPPOConfig, DPPOTrainer
+
+from ..testing_utils import TrlTestCase
+
+
+class TestDPPODivergenceMask:
+    """Unit tests for _compute_divergence_mask with synthetic inputs."""
+
+    def _make_trainer(self, divergence_type="binary_tv", epsilon=0.2, epsilon_high=0.28):
+        """Create a minimal DPPOTrainer-like object with just the attributes needed for _compute_divergence_mask."""
+
+        class Stub:
+            pass
+
+        stub = Stub()
+        stub.divergence_type = divergence_type
+        stub.epsilon_low = epsilon
+        stub.epsilon_high = epsilon_high
+        return stub
+
+    def test_binary_tv_no_masking_within_threshold(self):
+        stub = self._make_trainer("binary_tv", epsilon=0.2, epsilon_high=0.28)
+        # Policies are very close — no tokens should be masked
+        sampling_logps = torch.log(torch.tensor([[0.5, 0.3, 0.7]]))
+        current_logps = torch.log(torch.tensor([[0.51, 0.29, 0.71]]))
+        advantages = torch.tensor([[1.0]])
+        completion_mask = torch.ones(1, 3)
+
+        mask = DPPOTrainer._compute_divergence_mask(stub, current_logps, sampling_logps, advantages, completion_mask)
+        assert mask.shape == (1, 3)
+        assert (mask == 1.0).all()
+
+    def test_binary_tv_masks_positive_advantage_high_divergence(self):
+        stub = self._make_trainer("binary_tv", epsilon=0.01, epsilon_high=0.01)
+        # π much higher than μ, positive advantage → should be masked (invalid_pos)
+        sampling_logps = torch.log(torch.tensor([[0.1]]))
+        current_logps = torch.log(torch.tensor([[0.5]]))
+        advantages = torch.tensor([[1.0]])
+        completion_mask = torch.ones(1, 1)
+
+        mask = DPPOTrainer._compute_divergence_mask(stub, current_logps, sampling_logps, advantages, completion_mask)
+        assert mask.item() == 0.0
+
+    def test_binary_tv_masks_negative_advantage_low_divergence(self):
+        stub = self._make_trainer("binary_tv", epsilon=0.01, epsilon_high=0.01)
+        # π much lower than μ, negative advantage → should be masked (invalid_neg)
+        sampling_logps = torch.log(torch.tensor([[0.5]]))
+        current_logps = torch.log(torch.tensor([[0.1]]))
+        advantages = torch.tensor([[-1.0]])
+        completion_mask = torch.ones(1, 1)
+
+        mask = DPPOTrainer._compute_divergence_mask(stub, current_logps, sampling_logps, advantages, completion_mask)
+        assert mask.item() == 0.0
+
+    def test_binary_tv_respects_completion_mask(self):
+        stub = self._make_trainer("binary_tv", epsilon=0.01, epsilon_high=0.01)
+        # Even though divergence is huge, padding tokens stay 0
+        sampling_logps = torch.log(torch.tensor([[0.1, 0.5]]))
+        current_logps = torch.log(torch.tensor([[0.9, 0.9]]))
+        advantages = torch.tensor([[1.0]])
+        completion_mask = torch.tensor([[1.0, 0.0]])
+
+        mask = DPPOTrainer._compute_divergence_mask(stub, current_logps, sampling_logps, advantages, completion_mask)
+        assert mask[0, 1].item() == 0.0
+
+    def test_topk_tv_requires_topk_inputs(self):
+        stub = self._make_trainer("topk_tv")
+        B, T, K = 1, 2, 4
+        sampling_logps = torch.log(torch.full((B, T), 0.3))
+        current_logps = torch.log(torch.full((B, T), 0.31))
+        advantages = torch.tensor([[1.0]])
+        completion_mask = torch.ones(B, T)
+
+        # Build top-K distributions that are nearly identical
+        topk_probs = torch.softmax(torch.randn(B, T, K), dim=-1)
+        sampling_topk_logps = torch.log(topk_probs)
+        current_topk_logps = torch.log(topk_probs + 0.001)
+
+        mask = DPPOTrainer._compute_divergence_mask(
+            stub,
+            current_logps,
+            sampling_logps,
+            advantages,
+            completion_mask,
+            current_topk_logps=current_topk_logps,
+            sampling_topk_logps=sampling_topk_logps,
+        )
+        assert mask.shape == (B, T)
+        assert (mask == 1.0).all()
+
+
+@pytest.mark.low_priority
+class TestDPPOTrainer(TrlTestCase):
+    @pytest.mark.parametrize("divergence_type", ["binary_tv", "binary_kl"])
+    def test_training_binary(self, divergence_type):
+        dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train")
+
+        config = DPPOConfig(
+            output_dir=self.tmp_dir,
+            learning_rate=0.1,
+            per_device_train_batch_size=3,
+            num_generations=3,
+            max_completion_length=8,
+            divergence_type=divergence_type,
+            report_to="none",
+        )
+        trainer = DPPOTrainer(
+            model="trl-internal-testing/tiny-Qwen2ForCausalLM-2.5",
+            reward_funcs="trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5",
+            args=config,
+            train_dataset=dataset,
+        )
+
+        previous_trainable_params = {n: param.clone() for n, param in trainer.model.named_parameters()}
+
+        trainer.train()
+
+        assert trainer.state.log_history[-1]["train_loss"] is not None
+
+        for n, param in previous_trainable_params.items():
+            new_param = trainer.model.get_parameter(n)
+            assert not torch.equal(param, new_param), f"Parameter {n} has not changed."
+
+    def test_training_with_custom_reward_func(self):
+        dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train")
+
+        def dummy_reward(completions, **kwargs):
+            return [float(len(c)) for c in completions]
+
+        config = DPPOConfig(
+            output_dir=self.tmp_dir,
+            learning_rate=0.1,
+            per_device_train_batch_size=3,
+            num_generations=3,
+            max_completion_length=8,
+            report_to="none",
+        )
+        trainer = DPPOTrainer(
+            model="trl-internal-testing/tiny-Qwen2ForCausalLM-2.5",
+            reward_funcs=dummy_reward,
+            args=config,
+            train_dataset=dataset,
+        )
+
+        trainer.train()
+
+        assert trainer.state.log_history[-1]["train_loss"] is not None
+
+    @pytest.mark.parametrize("config_name", ["standard_prompt_only", "conversational_prompt_only"])
+    def test_training_conversational(self, config_name):
+        dataset = load_dataset("trl-internal-testing/zen", config_name, split="train")
+
+        config = DPPOConfig(
+            output_dir=self.tmp_dir,
+            learning_rate=0.1,
+            per_device_train_batch_size=3,
+            num_generations=3,
+            max_completion_length=8,
+            report_to="none",
+        )
+        trainer = DPPOTrainer(
+            model="trl-internal-testing/tiny-Qwen2ForCausalLM-2.5",
+            reward_funcs="trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5",
+            args=config,
+            train_dataset=dataset,
+        )
+
+        trainer.train()
+
+        assert trainer.state.log_history[-1]["train_loss"] is not None
diff --git a/tests/test_vllm_client_server.py b/tests/test_vllm_client_server.py
@@ -20,6 +20,7 @@
 from transformers.testing_utils import torch_device
 
 from trl.generation.vllm_client import VLLMClient
+from trl.generation.vllm_generation import extract_logprobs
 from trl.import_utils import is_vllm_available
 from trl.scripts.vllm_serve import chunk_list
 
@@ -221,13 +222,15 @@ def test_logprobs_match_with_non_default_sampling(self):
         top_p = 0.9
         max_tokens = 8
         seed = 1234
+        num_logprobs = 5
 
         server_outputs = self.client.generate(
             prompts,
             temperature=temperature,
             repetition_penalty=repetition_penalty,
             top_p=top_p,
             max_tokens=max_tokens,
+            logprobs=num_logprobs,
             generation_kwargs={"seed": seed},
         )
         os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
@@ -244,27 +247,51 @@ def test_logprobs_match_with_non_default_sampling(self):
             repetition_penalty=repetition_penalty,
             top_p=top_p,
             max_tokens=max_tokens,
-            logprobs=0,  # this is what's used in practice to get the logprobs of generated tokens
+            logprobs=num_logprobs,
             seed=seed,
         )
         colocate_outputs = llm.generate(prompts, sampling_params=sampling_params, use_tqdm=False)
         colocate_prompt_ids = [output.prompt_token_ids for output in colocate_outputs]
         colocate_completion_ids = [
             list(output.token_ids) for outputs in colocate_outputs for output in outputs.outputs
         ]
-        colocate_logprobs = [
-            [next(iter(logprob.values())).logprob for logprob in output.logprobs]
-            for outputs in colocate_outputs
-            for output in outputs.outputs
-        ]
+        colocate_logprobs, colocate_logprob_token_ids = extract_logprobs(colocate_outputs)
 
+        # Generation correctness: prompt and completion IDs match between server and colocate
         assert server_outputs["prompt_ids"] == colocate_prompt_ids
         assert server_outputs["completion_ids"] == colocate_completion_ids
+
         server_logprobs = server_outputs["logprobs"]
-        assert len(server_logprobs) == len(colocate_logprobs)
+        server_logprob_token_ids = server_outputs["logprob_token_ids"]
+
+        # Shape: both should be (num_sequences, seq_len, num_logprobs) with multiple logprobs per token
+        assert len(server_logprobs) == len(prompts)
+        assert len(server_logprob_token_ids) == len(prompts)
+        for seq_lps in server_logprobs:
+            for token_lps in seq_lps:
+                assert len(token_lps) > 1, "Expected multiple logprobs per token when logprobs > 0"
+
+        # Value correctness: server extraction matches colocate extraction via extract_logprobs
+        assert server_logprob_token_ids == colocate_logprob_token_ids
         for server_seq, colocate_seq in zip(server_logprobs, colocate_logprobs, strict=True):
             assert len(server_seq) == len(colocate_seq)
-            assert server_seq == pytest.approx(colocate_seq, rel=1e-6, abs=1e-6)
+            for server_token_lps, colocate_token_lps in zip(server_seq, colocate_seq, strict=True):
+                assert server_token_lps == pytest.approx(colocate_token_lps, rel=1e-6, abs=1e-6)
+
+        # Ordering: logprobs at each position should be sorted descending
+        for seq_lps in server_logprobs:
+            for token_lps in seq_lps:
+                assert token_lps == sorted(token_lps, reverse=True), "Logprobs should be sorted descending"
+
+        # Sampled token presence: the actual completion token should appear in the logprob token IDs
+        for seq_idx, (completion_seq, token_ids_seq) in enumerate(
+            zip(server_outputs["completion_ids"], server_logprob_token_ids, strict=True)
+        ):
+            for pos, (sampled_id, lp_ids) in enumerate(zip(completion_seq, token_ids_seq, strict=True)):
+                assert sampled_id in lp_ids, (
+                    f"Sampled token {sampled_id} not found in logprob token IDs {lp_ids} "
+                    f"at sequence {seq_idx}, position {pos}"
+                )
 
     @classmethod
     def teardown_class(cls):

diff --git a/trl/experimental/dppo/__init__.py b/trl/experimental/dppo/__init__.py
@@ -0,0 +1,17 @@
+# Copyright 2020-2026 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from .dppo_config import DPPOConfig
+from .dppo_trainer import DPPOTrainer