diff --git a/.github/workflows/code_checks.yml b/.github/workflows/code_checks.yml
index 648712e..4e29843 100644
--- a/.github/workflows/code_checks.yml
+++ b/.github/workflows/code_checks.yml
@@ -24,6 +24,9 @@ on:
       - uv.lock
       - pyproject.toml
       - '**.ipynb'
+    paths-ignore:
+      - "trl/**"
+      - "**/factualdpo_trainer.py"
 
 jobs:
   run-code-check:
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index d790681..961205c 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -49,6 +49,7 @@ repos:
       - id: typos
         args: []
 
+
   - repo: https://github.com/nbQA-dev/nbQA
     rev: 1.9.1
     hooks:
diff --git a/pyproject.toml b/pyproject.toml
index dd204a0..4c8f8ab 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -120,7 +120,10 @@ mypy_path = "src"
 # -----------------------------------------------------
 [tool.ruff]
 include = ["*.py", "pyproject.toml", "*.ipynb"]
-exclude = []
+exclude = [
+    "trl",
+    "**/factualdpo_trainer.py",
+]
 line-length = 88
 
 [tool.ruff.format]
diff --git a/src/aixpert/data_construction/config/config.yaml b/src/aixpert/data_construction/config/config.yaml
index 261c6b4..c48e850 100644
--- a/src/aixpert/data_construction/config/config.yaml
+++ b/src/aixpert/data_construction/config/config.yaml
@@ -1,5 +1,3 @@
-repository: /projects/aixpert/users/sindhu/Loss_Test
-
 model:
   name: gpt-4o-mini   # or gpt-4o
   temperature: 0.8
@@ -36,6 +34,9 @@ paths:
   train_flipped_out: "src/aixpert/data_construction/data/train_balanced_flipped.jsonl"
   eval_flipped_out: "src/aixpert/data_construction/data/eval_final_flipped.jsonl"
 
+  final_train: "src/aixpert/data_construction/data/train_final_processed.jsonl"
+  final_eval: "src/aixpert/data_construction/data/eval_final_processed.jsonl"
+
 
 
   skywork_file: "Skywork/Skywork-Reward-Preference-80K-v0.1"
@@ -59,3 +60,8 @@ hyperparams:
       "(1,1)": 10000
 
   eval_additional_clean_samples: 1500
+
+dataset_processing:
+  keep_keys: ["prompt", "chosen", "rejected", "h_w", "h_l", "flipped"]
+
+openai_api_key: "${OPENAI_API_KEY}"
diff --git a/src/aixpert/data_construction/stage_9_last/data_train.py b/src/aixpert/data_construction/stage_9_last/data_train.py
new file mode 100644
index 0000000..ae02d22
--- /dev/null
+++ b/src/aixpert/data_construction/stage_9_last/data_train.py
@@ -0,0 +1,24 @@
+"""This script performs the final cleanup step for the train dataset (flipping + key filtering)."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from utils.config_loader import load_config
+from utils.data_utils import process_jsonl_with_flip
+
+
+def main():
+    """Load paths and run the final processing stage for the train dataset."""
+    paths = load_config()["paths"]
+
+    input_path = Path(paths["train_flipped_out"])
+    output_path = Path(paths["final_train"])
+
+    process_jsonl_with_flip(input_path=input_path, output_path=output_path)
+
+    print("Train dataset final processing completed.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/aixpert/data_construction/stage_9_last/data_val.py b/src/aixpert/data_construction/stage_9_last/data_val.py
new file mode 100644
index 0000000..1355304
--- /dev/null
+++ b/src/aixpert/data_construction/stage_9_last/data_val.py
@@ -0,0 +1,24 @@
+"""This script performs the final cleanup step for the train dataset (flipping + key filtering)."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from utils.config_loader import load_config
+from utils.data_utils import process_jsonl_with_flip
+
+
+def main():
+    """Load paths and run the final processing stage for the val dataset."""
+    paths = load_config()["paths"]
+
+    input_path = Path(paths["eval_flipped_out"])
+    output_path = Path(paths["final_eval"])
+
+    process_jsonl_with_flip(input_path=input_path, output_path=output_path)
+
+    print("val dataset final processing completed.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/aixpert/data_construction/utils/data_utils.py b/src/aixpert/data_construction/utils/data_utils.py
index d4268b2..f5efaa5 100644
--- a/src/aixpert/data_construction/utils/data_utils.py
+++ b/src/aixpert/data_construction/utils/data_utils.py
@@ -11,6 +11,8 @@
 from pathlib import Path
 from typing import Any, Dict, List, Tuple
 
+from utils.config_loader import load_config
+
 
 def extract_prompt(dialog: List[Dict[str, Any]]) -> str:
     """Extract the first user message."""
@@ -121,3 +123,28 @@ def flip_sample(item: Dict[str, Any]) -> Dict[str, Any]:
         item["h_w"], item["h_l"] = 0, 1
         item["chosen"], item["rejected"] = item["rejected"], item["chosen"]
     return item
+
+
+def process_jsonl_with_flip(input_path: str, output_path: str):
+    """Remove keys"""
+    cfg = load_config()
+    keep_keys = cfg.dataset_processing.keep_keys
+
+    with open(input_path, "r") as f_in, open(output_path, "w") as f_out:
+        for line in f_in:
+            if not line.strip():
+                continue
+
+            data = json.loads(line)
+
+            if data.get("source") == "synthetic_inversion":
+                data["flipped"] = True
+
+            cleaned = {k: data.get(k) for k in keep_keys}
+
+            if cleaned.get("flipped") is None:
+                cleaned["flipped"] = False
+
+            f_out.write(json.dumps(cleaned) + "\n")
+
+    print(f"[✓] Saved processed file to: {output_path}")
diff --git a/src/aixpert/evaluation/README_.md b/src/aixpert/evaluation/README_.md
new file mode 100644
index 0000000..abf6df9
--- /dev/null
+++ b/src/aixpert/evaluation/README_.md
@@ -0,0 +1,172 @@
+# AIXpert Preference Alignment — Evaluation Pipeline
+GPT-4o-mini Judge · Factuality Scoring · Multi-Model Benchmarking
+
+This directory implements the **automated evaluation pipeline** used to benchmark:
+
+- Original-DPO models
+- Factual-DPO models (across Δ = 0, 2, 4, 6, 8, 10, 20, 30, 50, 100)
+
+Evaluation is performed using **GPT-4o-mini** as an LLM-as-a-judge.
+
+All evaluation configuration is pulled from:
+
+```
+src/aixpert/config/config.yaml
+```
+
+---
+
+## 📁 Evaluation Directory Structure
+
+```
+src/aixpert/evaluation/
+│
+├── evaluations/
+│   └── run_all_evaluations.py      # Main orchestrator
+│
+├── utils/
+│   ├── eval_core_utils.py          # Generation + GPT judge scoring
+│   └── eval_template.py            # Factual judge prompt
+```
+
+---
+
+# ⚙️ Configuration Overview (Evaluation)
+
+The configuration includes:
+
+---
+
+## 1️⃣ Evaluation Settings
+
+```yaml
+eval:
+  data_file: "src/aixpert/data_construction/data/skywork_extracted_test.jsonl"
+  batch_size: 16
+  max_new_tokens: 2048
+  judge_concurrency: 10
+```
+
+---
+
+## 2️⃣ Model Paths
+
+```yaml
+paths:
+  original_root: "src/aixpert/training/data/original/Models"
+  modified_root: "src/aixpert/training/data/modified/Models"
+```
+
+The evaluation script automatically locates checkpoints:
+
+```
+<short>_OriginalDPO/
+<short>_delta<value>/
+```
+
+---
+
+## 3️⃣ Model Registry & Δ Values
+
+```yaml
+models:
+  - short: "gemma2-9b"
+  - short: "llama3-8b"
+  - short: "qwen3-8b"
+```
+
+```yaml
+deltas: [0, 2, 4, 6, 8, 10, 20, 30, 50, 100]
+```
+
+Total evaluations:
+
+```
+7 models × 10 deltas = 70 comparisons
+```
+
+---
+
+## 🧠 Factual Judge Model
+
+```yaml
+model:
+  name: "gpt-4o-mini"
+  temperature: 0.8
+```
+
+---
+
+# 📊 Evaluation Metrics
+
+For each model pair, the pipeline computes:
+
+| Metric | Meaning |
+|--------|---------|
+| factuality_A | Mean factual score of Original-DPO model |
+| factuality_B | Mean factual score of Δ-model |
+| halluc_rate_A | % outputs scoring < 5 |
+| halluc_rate_B | % outputs scoring < 5 |
+| win_rate | How often Δ-model outperforms baseline |
+| count | Total prompts evaluated |
+
+Results saved to:
+
+```
+eval_results.json
+```
+
+---
+
+# 🚀 Running Evaluation
+
+```bash
+python -m aixpert.evaluation.evaluations.run_all_evaluations
+```
+
+The script:
+
+1. Loads config
+2. Loads evaluation prompts
+3. Loads Original-DPO and Δ-models
+4. Generates responses
+5. Sends to GPT-4o-mini asynchronously
+6. Computes metrics
+7. Saves results
+
+---
+
+# 🧩 Core Components
+
+## `eval_core_utils.py`
+
+Includes:
+
+- **batch_generate()** → Deterministic HF inference
+- **judge_factual()** → Scores one answer
+- **judge_many()** → Async batch scoring
+- **evaluate_pair()** → Full evaluation for one (model, Δ)
+
+---
+
+## `eval_template.py`
+
+Provides the factuality judge prompt using:
+
+```
+[[score]]
+```
+
+format.
+
+---
+
+# ✅ Summary
+
+This evaluation pipeline provides:
+
+- End-to-end factuality benchmarking
+- Async OpenAI judge scoring
+- Multi-model × multi-delta evaluation
+- Config-driven reproducibility
+- Clean JSON output for papers and analysis
diff --git a/src/aixpert/evaluation/config/config.yaml b/src/aixpert/evaluation/config/config.yaml
new file mode 100644
index 0000000..bfa4c17
--- /dev/null
+++ b/src/aixpert/evaluation/config/config.yaml
@@ -0,0 +1,26 @@
+eval:
+  data_file: "src/aixpert/data_construction/data/skywork_extracted_test.jsonl"
+  batch_size: 16
+  max_new_tokens: 2048
+  judge_concurrency: 10
+
+paths:
+  original_root: "src/aixpert/training/data/original/Models"
+  factual_root: "src/aixpert/training/data/factual/Models"
+
+models:
+  - short: "gemma2-9b"
+  - short: "qwen2.5-14b"
+  - short: "llama3.2-1b"
+  - short: "gemma2-2b"
+  - short: "llama3-8b"
+  - short: "qwen2-7b"
+  - short: "qwen3-8b"
+
+deltas: [0, 2, 4, 6, 8, 10, 20, 30, 50, 100]
+
+llm-as-judge:
+  name: gpt-4o-mini   # or gpt-4o
+  temperature: 0.8
+
+openai_api_key: "${OPENAI_API_KEY}"
diff --git a/src/aixpert/evaluation/evaluations/run_all_evaluations.py b/src/aixpert/evaluation/evaluations/run_all_evaluations.py
new file mode 100644
index 0000000..b663ab1
--- /dev/null
+++ b/src/aixpert/evaluation/evaluations/run_all_evaluations.py
@@ -0,0 +1,71 @@
+"""
+Run factuality evaluation for all models and all delta values.
+
+This module loads the global evaluation config, iterates over every
+(model, Δ) pair, evaluates Original-DPO vs. Factual-DPO, and stores
+results in a JSON file.
+"""
+
+import asyncio
+import json
+
+from utils.config_loader import load_config
+from utils.eval_core_utils import evaluate_pair
+
+
+async def main() -> None:
+    """
+    Execute evaluation for all model–delta combinations as defined in the config.
+
+    Loads evaluation parameters, model paths, and OpenAI judge configuration,
+    runs factuality scoring for each pair, and writes a consolidated JSON file.
+    """
+    cfg = load_config()
+
+    data_file = cfg["eval"]["data_file"]
+    batch_size = cfg["eval"]["batch_size"]
+    max_new = cfg["eval"]["max_new_tokens"]
+    concurrency = cfg["eval"]["judge_concurrency"]
+
+    original_root = cfg["paths"]["original_root"]
+    factual_root = cfg["paths"]["factual_root"]
+
+    models = cfg["models"]
+    deltas = cfg["deltas"]
+
+    api_key = cfg["openai_api_key"]
+    judge_model = cfg["llm-as-judge"]["name"]
+
+    results = {}
+
+    for m in models:
+        short = m["short"]
+        orig_model_path = f"{original_root}/{short}_OriginalDPO"
+
+        for d in deltas:
+            mod_model_path = f"{factual_root}/{short}_delta{d}"
+
+            print(f"\n=== Evaluating {short}: Original vs Δ={d} ===")
+
+            out = await evaluate_pair(
+                data_file=data_file,
+                model_a_dir=orig_model_path,
+                model_b_dir=mod_model_path,
+                batch_size=batch_size,
+                max_new_tokens=max_new,
+                concurrency=concurrency,
+                api_key=api_key,
+                judge_model=judge_model,
+            )
+
+            results[f"{short}_delta{d}"] = out
+            print(json.dumps(out, indent=2))
+
+    with open("eval_results.json", "w", encoding="utf-8") as f:
+        json.dump(results, f, indent=2)
+
+    print("\nSaved to eval_results.json")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/src/aixpert/evaluation/utils/config_loader.py b/src/aixpert/evaluation/utils/config_loader.py
new file mode 100644
index 0000000..88b3060
--- /dev/null
+++ b/src/aixpert/evaluation/utils/config_loader.py
@@ -0,0 +1,17 @@
+"""Utility module for loading the global YAML configuration file."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any, Dict
+
+import yaml
+
+
+CONFIG_PATH = Path(__file__).resolve().parents[1] / "config" / "config.yaml"
+
+
+def load_config() -> Dict[str, Any]:
+    """Load YAML config into a dictionary."""
+    with open(CONFIG_PATH, "r", encoding="utf-8") as f:
+        return yaml.safe_load(f)
diff --git a/src/aixpert/evaluation/utils/eval_core_utils.py b/src/aixpert/evaluation/utils/eval_core_utils.py
new file mode 100644
index 0000000..8377e23
--- /dev/null
+++ b/src/aixpert/evaluation/utils/eval_core_utils.py
@@ -0,0 +1,184 @@
+"""
+Core evaluation utilities for factuality scoring.
+
+This module provides:
+- Batched text generation for two fine-tuned models.
+- Asynchronous factual judging using GPT-4o-mini (or another configured judge).
+- A high-level function `evaluate_pair()` for comparing Original-DPO vs Factual-DPO.
+"""
+
+import asyncio
+import json
+import re
+from typing import List
+
+import numpy as np
+import torch
+from openai import AsyncOpenAI
+from tqdm import tqdm
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from .eval_template import FACTUAL_PROMPT
+
+
+async def judge_factual(
+    prompt: str,
+    answer: str,
+    semaphore: asyncio.Semaphore,
+    client: AsyncOpenAI,
+    judge_model: str,
+) -> float:
+    """
+    Send one prompt–answer pair to the LLM judge and extract a factuality score.
+
+    Returns
+    -------
+        float: The extracted factuality score, or 0.0 if parsing fails.
+    """
+    query = FACTUAL_PROMPT.format(question=prompt, answer=answer)
+
+    async with semaphore:
+        response = await client.chat.completions.create(
+            model=judge_model,
+            messages=[{"role": "user", "content": query}],
+            temperature=0,
+        )
+
+    text = response.choices[0].message.content
+    match = re.search(r"\[\[(\d+(?:\.\d+)?)\]\]", text)
+    return float(match.group(1)) if match else 0.0
+
+
+async def judge_many(
+    prompts: List[str],
+    answers: List[str],
+    concurrency: int,
+    api_key: str,
+    judge_model: str,
+) -> List[float]:
+    """
+    Evaluate many answers in parallel using GPT-4o-mini (or configured judge).
+
+    Returns
+    -------
+        List[float]: List of factuality scores for each answer.
+    """
+    semaphore = asyncio.Semaphore(concurrency)
+    client = AsyncOpenAI(api_key=api_key)
+
+    tasks = [
+        asyncio.create_task(
+            judge_factual(prompt, answer, semaphore, client, judge_model)
+        )
+        for prompt, answer in zip(prompts, answers)
+    ]
+    return await asyncio.gather(*tasks)
+
+
+def batch_generate(
+    model: AutoModelForCausalLM,
+    tokenizer: AutoTokenizer,
+    prompts: List[str],
+    device: str,
+    max_tokens: int,
+) -> List[str]:
+    """
+    Generate model outputs in batches.
+
+    Returns
+    -------
+        List[str]: Cleaned outputs with the prompt removed.
+    """
+    encoded = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to(
+        device
+    )
+
+    with torch.no_grad():
+        generated = model.generate(
+            **encoded,
+            max_new_tokens=max_tokens,
+            temperature=0.2,
+            do_sample=False,
+            pad_token_id=tokenizer.eos_token_id,
+        )
+
+    decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)
+
+    cleaned = []
+    for prompt, full in zip(prompts, decoded):
+        cleaned.append(
+            full[len(prompt) :].strip() if full.startswith(prompt) else full.strip()
+        )
+
+    return cleaned
+
+
+async def evaluate_pair(
+    data_file: str,
+    model_a_dir: str,
+    model_b_dir: str,
+    batch_size: int,
+    max_new_tokens: int,
+    concurrency: int,
+    api_key: str,
+    judge_model: str,
+) -> dict:
+    """
+    Evaluate Original-DPO (model A) vs Factual-DPO (model B) factuality.
+
+    Returns
+    -------
+        dict: Metrics including factuality means, hallucination rates, and win-rate.
+    """
+    # Load prompts safely using a context manager
+    with open(data_file, "r", encoding="utf-8") as f:
+        prompts = [json.loads(line)["prompt"] for line in f]
+
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    tokenizer = AutoTokenizer.from_pretrained(model_a_dir)
+    tokenizer.padding_side = "left"
+
+    model_a = AutoModelForCausalLM.from_pretrained(
+        model_a_dir, torch_dtype=torch.bfloat16
+    ).to(device)
+    model_b = AutoModelForCausalLM.from_pretrained(
+        model_b_dir, torch_dtype=torch.bfloat16
+    ).to(device)
+
+    model_a.eval()
+    model_b.eval()
+    torch.set_grad_enabled(False)
+
+    all_a_scores, all_b_scores = [], []
+
+    for start in tqdm(range(0, len(prompts), batch_size)):
+        batch = prompts[start : start + batch_size]
+
+        ans_a = batch_generate(model_a, tokenizer, batch, device, max_new_tokens)
+        ans_b = batch_generate(model_b, tokenizer, batch, device, max_new_tokens)
+
+        scores_a = await judge_many(batch, ans_a, concurrency, api_key, judge_model)
+        scores_b = await judge_many(batch, ans_b, concurrency, api_key, judge_model)
+
+        all_a_scores.extend(scores_a)
+        all_b_scores.extend(scores_b)
+
+    arr_a = np.array(all_a_scores)
+    arr_b = np.array(all_b_scores)
+
+    # Avoid divide-by-zero when A == B everywhere
+    diff_mask = arr_a != arr_b
+    wins = (arr_b > arr_a).sum()
+    losses = diff_mask.sum()
+
+    win_rate = wins / losses if losses > 0 else 0.0
+
+    return {
+        "factuality_A": float(arr_a.mean()),
+        "factuality_B": float(arr_b.mean()),
+        "halluc_rate_A": float((arr_a < 5).mean()),
+        "halluc_rate_B": float((arr_b < 5).mean()),
+        "win_rate": win_rate,
+        "count": len(arr_a),
+    }
diff --git a/src/aixpert/evaluation/utils/eval_template.py b/src/aixpert/evaluation/utils/eval_template.py
new file mode 100644
index 0000000..2ce0522
--- /dev/null
+++ b/src/aixpert/evaluation/utils/eval_template.py
@@ -0,0 +1,34 @@
+"""
+Factuality Judge Prompt Template.
+
+This module contains the evaluation prompt used by GPT-4o-mini
+to score factual correctness of model outputs.
+"""
+
+from __future__ import annotations
+
+
+FACTUAL_PROMPT = """
+System:
+You are an evaluation model scoring how factually reasonable an assistant's answer is.
+
+You are NOT a harsh critic.
+Give credit for partially correct answers.
+If unsure whether a claim is false, assume it is true.
+
+Scoring:
+10 = Completely correct
+8–9 = Mostly correct
+6–7 = Generally factual, small issues
+4–5 = Noticeable factual problems
+0–3 = Major hallucinations
+
+Output ONLY a score using the format:
+[[score]]
+
+[User Question]
+{question}
+
+[Assistant Answer]
+{answer}
+"""
diff --git a/src/aixpert/training/README.md b/src/aixpert/training/README.md
new file mode 100644
index 0000000..3621a8a
--- /dev/null
+++ b/src/aixpert/training/README.md
@@ -0,0 +1,344 @@
+# Factual Preference Alignment — Training Pipeline
+Original-DPO & Factual-DPO Fine-Tuning
+Vector Institute — AI Engineering Template Compatible
+
+This directory contains the full training pipeline for:
+
+1. **Original Direct Preference Optimization (DPO)** — Baseline alignment
+2. **Factual-DPO** — A FactualDPO-style variant with a factual margin Δ
+3. **Multi-model training orchestration** driven entirely by `config.yaml`
+
+## 📐 DPO Objectives
+
+This section summarizes the **training objectives** used in this repository.
+
+---
+
+### Original DPO Objective (Baseline)
+
+Given a preference tuple \((x, y_w, y_l)\) and a reference policy π_ref, the **Direct Preference Optimization (DPO)** margin is defined as:
+
+```math
+m(x, y_w, y_l) =
+\log \frac{\pi_\theta(y_w \mid x)}{\pi_\theta(y_l \mid x)}
+-
+\log \frac{\pi_{\text{ref}}(y_w \mid x)}{\pi_{\text{ref}}(y_l \mid x)}
+```
+
+The **Original DPO loss** is:
+
+```math
+\mathcal{L}_{\text{DPO}}(\theta)
+=
+-\mathbb{E}_{(x,y_w,y_l)}
+\left[
+\log \sigma\left(\beta \cdot m(x,y_w,y_l)\right)
+\right]
+```
+where:
+- πθ: trainable policy
+- π_ref: frozen reference policy
+- β: temperature parameter
+- σ(·): sigmoid function
+
+---
+
+### Factual-DPO
+
+Each preference tuple additionally includes factuality indicators
+(h_w, h_l) in \{0,1\}, where \(1\) denotes a factual violation.
+
+After label transformation, define:
+
+```math
+\Delta h = h_l - h_w \in \{0, 1\}
+```
+
+
+The **factuality-aware margin** is:
+
+```math
+m_{\text{fact}} =
+m - \lambda \cdot \Delta h
+```
+
+The **Factual-DPO loss** is:
+
+```math
+\mathcal{L}_{\text{FactualDPO}}(\theta)
+=
+-\mathbb{E}_{(x,y_w,y_l,h_w,h_l)}
+\left[
+\log \sigma\left(\beta \cdot (m - \lambda \cdot \Delta h)\right)
+\right]
+```
+
+where:
+- λ controls the strength of the factuality penalty
+- Larger λ enforces stronger hallucination suppression
+- When Δh = 0, the loss reduces to **Original DPO****
+
+---
+
+### Key Difference
+
+| Method | Optimization Target |
+|------|---------------------|
+| Original DPO |log σ(β · m) |
+| Factual-DPO | log σ(β · (m − λΔh))|
+
+
+
+All training is configured from:
+
+```
+src/aixpert/config/config.yaml
+```
+
+---
+
+## 📌 Configuration Overview (Training)
+
+All training behavior—models, hyperparameters, dataset paths, LoRA configuration, and output directories—is defined centrally in:
+
+```
+src/aixpert/config/config.yaml
+```
+
+The configuration contains **three core blocks**:
+
+---
+
+## 1️⃣ Model Registry
+
+The `models:` section defines all base LLMs to be fine-tuned.
+
+Each entry includes:
+
+- **id** → HuggingFace model identifier
+- **short** → Filesystem-friendly shorthand used for checkpoints
+
+Example:
+
+```yaml
+models:
+  - id: "google/gemma-2-9b-it"
+    short: "gemma2-9b"
+  - id: "Qwen/Qwen2.5-14B-Instruct"
+    short: "qwen2.5-14b"
+  - id: "meta-llama/Llama-3.2-1B-Instruct"
+    short: "llama3.2-1b"
+```
+
+This registry enables **automatic multi-model training** for both Original-DPO and Factual-DPO.
+
+---
+
+## 2️⃣ Original-DPO Training Configuration
+
+Defined under:
+
+```yaml
+original_dpo:
+```
+
+Includes:
+
+### Dataset Paths
+
+```yaml
+paths:
+  train: "src/aixpert/training/data/original/train_finallast.jsonl"
+  eval:  "src/aixpert/training/data/original/eval_final.jsonl"
+  output_root: "src/aixpert/training/data/original/Models"
+```
+
+### Hyperparameters (LoRA + TRL DPO)
+
+```yaml
+hyperparams:
+  max_seq_length: 2048
+  load_in_4bit: true
+  lora_r: 32
+  lora_alpha: 64
+  lora_dropout: 0.05
+  batch_size: 2
+  grad_accumulation: 16
+  num_epochs: 3
+  learning_rate: 1.8e-6
+  warmup_ratio: 0.25
+  beta: 0.1
+  save_steps: 100
+  logging_steps: 20
+  seed: 3407
+```
+
+These govern **baseline DPO training**, implemented in:
+
+```
+src/aixpert/training/run_dpo_training.py
+```
+
+### Output directory structure
+
+```
+src/aixpert/training/data/original/Models/<short>_OriginalDPO/
+```
+
+---
+
+## 3️⃣ Factual-DPO Configuration
+
+Defined under:
+
+```yaml
+factual_dpo:
+```
+
+This block extends Original-DPO with factuality-aware training using SafeDPO-style Δ-margin.
+
+### Δ-Values
+
+```yaml
+deltas: [0, 2, 4, 6, 8, 10, 20, 30, 50, 100]
+```
+
+Each Δ produces a **separate fine-tuned model**.
+
+---
+
+### Factual Dataset Paths
+
+```yaml
+paths:
+  train_file: "src/aixpert/training/data/factual/train_final_flipped.jsonl"
+  eval_file:  "src/aixpert/training/data/factual/eval_final_flipped.jsonl"
+  output_root: "src/aixpert/training/data/factual/Models"
+```
+
+These datasets include:
+
+- factuality indicators (`h_w`, `h_l`)
+- corrected orientation for factual supervision
+
+---
+
+### Hyperparameters (TRL-compatible)
+
+```yaml
+hyperparams:
+  per_device_train_batch_size: 2
+  gradient_accumulation_steps: 16
+  num_train_epochs: 3
+  learning_rate: 1.8e-6
+  warmup_ratio: 0.25
+  save_steps: 100
+  logging_steps: 20
+  max_seq_length: 2048
+  lora_r: 32
+  lora_alpha: 64
+  lora_dropout: 0.05
+```
+
+---
+
+### Weights & Biases Logging
+
+```yaml
+wandb:
+  project: "aixpert"
+  entity: "vector-institute-aieng"
+  run_prefix: "FactualDPO"
+```
+
+Each training run is tagged:
+
+```
+FactualDPO_<short>_delta<value>
+```
+
+---
+
+## 🚀 Training Workflows
+
+### 1️⃣ Original-DPO Training
+
+```bash
+python -m aixpert.training.run_dpo_training --model "google/gemma-2-9b-it"
+```
+
+This:
+
+- Loads config
+- Loads datasets
+- Applies QLoRA
+- Trains TRL DPO
+- Saves to:
+
+```
+src/aixpert/training/data/original/Models/<short>_OriginalDPO/
+```
+
+---
+
+### 2️⃣ Factual-DPO Training
+
+Train a model with Δ=10:
+
+```bash
+python -m aixpert.training.run_factual_training     --model_id "google/gemma-2-9b-it"     --short "gemma2-9b"     --delta 10
+```
+
+Saves to:
+
+```
+src/aixpert/training/data/factual/Models/<short>_delta10/
+```
+
+---
+
+## 📌 Notes on TRL Files
+
+The following files are **copied directly from TRL GitHub** and intentionally excluded from linting:
+
+```
+training/trl/
+training/factualdpo_trainer.py
+```
+
+They contain internal SafeDPO logic required for Δ-margin training.
+
+---
+
+## ✅ Summary
+
+This training pipeline supports:
+
+- Multi-model Original-DPO training
+- Factual-DPO training with configurable Δ
+- Config-driven reproducibility
+- Full WandB integration
+- Unsloth QLoRA optimization
+
+## 🔧 Training Framework
+
+This project is built on top of **Hugging Face TRL (Transformer Reinforcement Learning)**, which provides the reference implementation for **Direct Preference Optimization (DPO)** and related preference-based fine-tuning methods.
+
+We reuse and extend TRL’s DPO training infrastructure to implement **Factual-DPO**, while preserving full compatibility with TRL’s training abstractions.
+
+🔗 **TRL Repository:**
+https://github.com/huggingface/trl
+
+---
+
+## 📚 Reference
+
+If you use this codebase or build upon it, please also cite the TRL library:
+
+```bibtex
+@software{trl,
+  author = {Hugging Face},
+  title = {TRL: Transformer Reinforcement Learning},
+  url = {https://github.com/huggingface/trl},
+  year = {2023}
+}
diff --git a/src/aixpert/training/config/config.yaml b/src/aixpert/training/config/config.yaml
new file mode 100644
index 0000000..e3c8f10
--- /dev/null
+++ b/src/aixpert/training/config/config.yaml
@@ -0,0 +1,67 @@
+models:
+  - id: "google/gemma-2-9b-it"
+    short: "gemma2-9b"
+  - id: "Qwen/Qwen2.5-14B-Instruct"
+    short: "qwen2.5-14b"
+  - id: "meta-llama/Llama-3.2-1B-Instruct"
+    short: "llama3.2-1b"
+  - id: "google/gemma-2-2b-it"
+    short: "gemma2-2b"
+  - id: "meta-llama/Meta-Llama-3-8B-Instruct"
+    short: "llama3-8b"
+  - id: "Qwen/Qwen2-7B-Instruct"
+    short: "qwen2-7b"
+  - id: "Qwen/Qwen3-8B"
+    short: "qwen3-8b"
+
+original_dpo:
+  paths:
+    train: "src/aixpert/training/data/original/train_finallast.jsonl"
+    eval:  "src/aixpert/training/data/original/eval_final.jsonl"
+    output_root: "src/aixpert/training/data/original/Models"
+
+  hyperparams:
+    max_seq_length: 2048
+    load_in_4bit: true
+    lora_r: 32
+    lora_alpha: 64
+    lora_dropout: 0.05
+    batch_size: 2
+    grad_accumulation: 16
+    num_epochs: 3
+    learning_rate: 1.8e-6
+    warmup_ratio: 0.25
+    beta: 0.1
+    save_steps: 100
+    logging_steps: 20
+    seed: 3407
+
+factual_dpo:
+  deltas: [0, 2, 4, 6, 8, 10, 20, 30, 50, 100]
+
+  paths:
+    train_file: "src/aixpert/training/data/factual/train_final_flipped.jsonl"
+    eval_file:  "src/aixpert/training/data/factual/eval_final_flipped.jsonl"
+    output_root: "src/aixpert/training/data/factual/Models"
+
+  hyperparams:
+    max_seq_length: 2048
+    load_in_4bit: true
+    lora_r: 32
+    lora_alpha: 64
+    lora_dropout: 0.05
+    per_device_train_batch_size: 2
+    per_device_eval_batch_size: 2
+    gradient_accumulation_steps: 16
+    num_train_epochs: 3
+    learning_rate: 1.8e-6
+    warmup_ratio: 0.25
+    beta: 0.1
+    save_steps: 100
+    logging_steps: 20
+    seed: 3407
+
+  wandb:
+    project: "aixpert"
+    entity: "vector-institute-aieng"
+    run_prefix: "FactualDPO"
diff --git a/src/aixpert/training/scripts/launch_all_models.py b/src/aixpert/training/scripts/launch_all_models.py
new file mode 100644
index 0000000..524eab6
--- /dev/null
+++ b/src/aixpert/training/scripts/launch_all_models.py
@@ -0,0 +1,20 @@
+"""Launch Original DPO training for all configured models."""
+
+import subprocess
+
+from utils.config_loader import load_config
+
+
+def main() -> None:
+    """Launch baseline DPO training for each configured model."""
+    cfg = load_config()
+    models = cfg["models"]
+
+    for m in models:
+        cmd = f'python training/run_dpo_training.py --model "{m["id"]}"'
+        print(f"Launching: {cmd}")
+        subprocess.run(cmd, check=False, shell=True)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/aixpert/training/scripts/launch_all_models_delta.py b/src/aixpert/training/scripts/launch_all_models_delta.py
new file mode 100644
index 0000000..ba5ede2
--- /dev/null
+++ b/src/aixpert/training/scripts/launch_all_models_delta.py
@@ -0,0 +1,30 @@
+"""Launch Factual-DPO training for all models across all delta values."""
+
+import subprocess
+
+from utils.config_loader import load_config
+
+
+def main() -> None:
+    """Launch training jobs for every model–delta combination."""
+    cfg = load_config()
+    models = cfg["models"]
+    deltas = cfg["factual_dpo"]["deltas"]
+
+    print("Launching FactualDPO training for all models × all deltas...")
+
+    for delta in deltas:
+        for m in models:
+            cmd = (
+                "python -m training.run_factual_training "
+                f'--model_id "{m["id"]}" '
+                f'--short "{m["short"]}" '
+                f"--delta {delta}"
+            )
+
+            print(f"\n Running: {cmd}")
+            subprocess.run(cmd, check=False, shell=True)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/aixpert/training/training/factualdpo_trainer.py b/src/aixpert/training/training/factualdpo_trainer.py
new file mode 100644
index 0000000..7a97a87
--- /dev/null
+++ b/src/aixpert/training/training/factualdpo_trainer.py
@@ -0,0 +1,2446 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import random
+import textwrap
+import warnings
+from collections import defaultdict
+from collections.abc import Callable
+from contextlib import contextmanager, nullcontext
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Literal
+
+import pandas as pd
+import torch
+import torch.nn.functional as F
+from accelerate import PartialState, logging
+from accelerate.utils import tqdm
+from datasets import Dataset, IterableDataset
+from torch import autocast, nn
+from torch.utils.data import DataLoader
+from transformers import (
+    AutoProcessor,
+    BaseImageProcessor,
+    DataCollator,
+    FeatureExtractionMixin,
+    PreTrainedModel,
+    PreTrainedTokenizerBase,
+    ProcessorMixin,
+)
+from transformers.data.data_collator import DataCollatorMixin
+from transformers.integrations import (
+    is_comet_available,
+    is_mlflow_available,
+    is_wandb_available,
+)
+from transformers.models.auto.modeling_auto import (
+    MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES,
+)
+from transformers.trainer_callback import TrainerCallback
+from transformers.trainer_utils import EvalLoopOutput
+from transformers.utils import is_liger_kernel_available, is_peft_available
+from trl.data_utils import maybe_apply_chat_template, maybe_extract_prompt
+from trl.models import create_reference_model, prepare_deepspeed
+from trl.models.utils import prepare_fsdp
+from trl.trainer.base_trainer import BaseTrainer
+from trl.trainer.callbacks import SyncRefModelCallback
+from trl.trainer.dpo_config import DPOConfig, FDivergenceConstants, FDivergenceType
+from trl.trainer.utils import (
+    RunningMoments,
+    cap_exp,
+    create_model_from_path,
+    disable_dropout_in_model,
+    empty_cache,
+    flush_left,
+    flush_right,
+    get_config_model_id,
+    log_table_to_comet_experiment,
+    pad,
+    pad_to_length,
+    peft_module_casting_to_bf16,
+    selective_log_softmax,
+)
+
+
+if is_peft_available():
+    from peft import (
+        PeftConfig,
+        PeftModel,
+        get_peft_model,
+        prepare_model_for_kbit_training,
+    )
+
+if is_liger_kernel_available():
+    from liger_kernel.chunked_loss import LigerFusedLinearDPOLoss
+
+
+if is_wandb_available():
+    import wandb
+
+if is_mlflow_available():
+    import mlflow
+
+
+logger = logging.get_logger(__name__)
+
+
+def shift_tokens_right(
+    input_ids: torch.Tensor, decoder_start_token_id: int
+) -> torch.Tensor:
+    """Shift input ids one token to the right, and pad with pad_token_id"""
+    shifted_input_ids = input_ids.new_zeros(input_ids.shape)
+    shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
+    shifted_input_ids[:, 0] = decoder_start_token_id
+    return shifted_input_ids
+
+
+@dataclass
+class DataCollatorForPreference(DataCollatorMixin):
+    """
+    Data collator used for preference data. Inputs are dynamically padded to the maximum length of a batch if they are
+    not all of the same length.
+
+    Args:
+        pad_token_id (`int`):
+            Token ID to use for padding.
+        return_tensors (`str`, *optional*, defaults to `"pt"`):
+            Type of Tensor to return. Only `"pt"` is currently supported.
+
+    Examples
+    --------
+    ```python
+    >>> from trl import DataCollatorForPreference
+
+    >>> collator = DataCollatorForPreference(pad_token_id=0)
+    >>> examples = [
+    ...     {"prompt_input_ids": [1, 2, 3], "chosen_input_ids": [4, 5], "rejected_input_ids": [6]},
+    ...     {"prompt_input_ids": [7, 8], "chosen_input_ids": [9, 10], "rejected_input_ids": [11, 12, 13]},
+    ... ]
+    >>> collator(examples)
+    {'prompt_input_ids': tensor([[1, 2, 3],
+                                 [0, 7, 8]]),
+     'prompt_attention_mask': tensor([[1, 1, 1],
+                                      [0, 1, 1]]),
+     'chosen_input_ids': tensor([[ 4,  5],
+                                 [ 9, 10]]),
+     'chosen_attention_mask': tensor([[1, 1],
+                                      [1, 1]]),
+     'rejected_input_ids': tensor([[ 6,  0,  0],
+                                   [11, 12, 13]]),
+     'rejected_attention_mask': tensor([[1, 0, 0],
+                                        [1, 1, 1]])
+    }
+    ```
+    """
+
+    pad_token_id: int
+    return_tensors: str = "pt"
+
+    def torch_call(
+        self, examples: list[list[int] | Any | dict[str, Any]]
+    ) -> dict[str, Any]:
+        # Convert to tensor
+        prompt_input_ids = [
+            torch.tensor(example["prompt_input_ids"]) for example in examples
+        ]
+        prompt_attention_mask = [
+            torch.ones_like(input_ids) for input_ids in prompt_input_ids
+        ]
+        chosen_input_ids = [
+            torch.tensor(example["chosen_input_ids"]) for example in examples
+        ]
+        chosen_attention_mask = [
+            torch.ones_like(input_ids) for input_ids in chosen_input_ids
+        ]
+        rejected_input_ids = [
+            torch.tensor(example["rejected_input_ids"]) for example in examples
+        ]
+        rejected_attention_mask = [
+            torch.ones_like(input_ids) for input_ids in rejected_input_ids
+        ]
+        if "pixel_values" in examples[0]:
+            pixel_values = [
+                torch.tensor(example["pixel_values"]) for example in examples
+            ]
+        if "pixel_attention_mask" in examples[0]:
+            pixel_attention_mask = [
+                torch.tensor(example["pixel_attention_mask"]) for example in examples
+            ]
+        if "ref_chosen_logps" in examples[0] and "ref_rejected_logps" in examples[0]:
+            ref_chosen_logps = torch.tensor(
+                [example["ref_chosen_logps"] for example in examples]
+            )
+            ref_rejected_logps = torch.tensor(
+                [example["ref_rejected_logps"] for example in examples]
+            )
+        # ---- factuality scores ----
+        if "h_w" in examples[0]:
+            h_w = torch.tensor(
+                [float(ex["h_w"]) for ex in examples], dtype=torch.float32
+            )
+
+        if "h_l" in examples[0]:
+            h_l = torch.tensor(
+                [float(ex["h_l"]) for ex in examples], dtype=torch.float32
+            )
+
+        # Pad
+        output = {}
+        output["prompt_input_ids"] = pad(
+            prompt_input_ids, padding_value=self.pad_token_id, padding_side="left"
+        )
+        output["prompt_attention_mask"] = pad(
+            prompt_attention_mask, padding_value=0, padding_side="left"
+        )
+        output["chosen_input_ids"] = pad(
+            chosen_input_ids, padding_value=self.pad_token_id
+        )
+        output["chosen_attention_mask"] = pad(chosen_attention_mask, padding_value=0)
+        output["rejected_input_ids"] = pad(
+            rejected_input_ids, padding_value=self.pad_token_id
+        )
+        output["rejected_attention_mask"] = pad(
+            rejected_attention_mask, padding_value=0
+        )
+        if "pixel_values" in examples[0]:
+            output["pixel_values"] = pad(pixel_values, padding_value=0.0)
+        if "pixel_attention_mask" in examples[0]:
+            output["pixel_attention_mask"] = pad(pixel_attention_mask, padding_value=0)
+        if "image_sizes" in examples[0]:
+            output["image_sizes"] = torch.tensor(
+                [example["image_sizes"] for example in examples]
+            )
+        if "ref_chosen_logps" in examples[0] and "ref_rejected_logps" in examples[0]:
+            output["ref_chosen_logps"] = ref_chosen_logps
+            output["ref_rejected_logps"] = ref_rejected_logps
+        if "token_type_ids" in examples[0]:
+            token_type_ids = [
+                torch.tensor(example["token_type_ids"]) for example in examples
+            ]
+            output["token_type_ids"] = pad(
+                token_type_ids, padding_value=0, padding_side="left"
+            )
+        if "h_w" in examples[0]:
+            output["h_w"] = h_w
+        if "h_l" in examples[0]:
+            output["h_l"] = h_l
+
+        return output
+
+
+class FactualDPOTrainer(BaseTrainer):
+    """
+    Trainer for Direct Preference Optimization (DPO) method.
+
+    This class is a wrapper around the [`transformers.Trainer`] class and inherits all of its attributes and methods.
+
+    Args:
+        model (`str | PreTrainedModel`):
+            Model to be trained. Can be either:
+
+            - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or a
+              path to a *directory* containing model weights saved using
+              [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded
+              using [`~transformers.AutoModelForCausalLM.from_pretrained`] with the keyword arguments in
+              `args.model_init_kwargs`.
+            - A [`~transformers.PreTrainedModel`] object. Only causal language models are supported.
+        ref_model ([`PreTrainedModelWrapper`]):
+            Hugging Face transformer model with a casual language modelling head. Used for implicit reward computation
+            and loss. If no reference model is provided, the trainer will create a reference model with the same
+            architecture as the model to be optimized.
+        args ([`DPOConfig`], *optional*):
+            Configuration for this trainer. If `None`, a default configuration is used.
+        data_collator ([`~transformers.DataCollator`], *optional*):
+            Function to use to form a batch from a list of elements of the processed `train_dataset` or `eval_dataset`.
+            Will default to [`DataCollatorForPreference`].
+        train_dataset ([`~datasets.Dataset`] or [`~datasets.IterableDataset`]):
+            Dataset to use for training. DPO supports [preference](#preference) type and. The format of the samples can
+            be either:
+
+            - [Standard](dataset_formats#standard): Each sample contains plain text.
+            - [Conversational](dataset_formats#conversational): Each sample contains structured messages (e.g., role
+              and content).
+        eval_dataset ([`~datasets.Dataset`], [`~datasets.IterableDataset`] or `dict[str, Dataset | IterableDataset]`):
+            Dataset to use for evaluation. It must meet the same requirements as `train_dataset`.
+        processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*):
+            Processing class used to process the data. If `None`, the processing class is loaded from the model's name
+            with [`~transformers.AutoTokenizer.from_pretrained`].
+        compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
+            The function that will be used to compute metrics at evaluation. Must take a [`EvalPrediction`] and return
+            a dictionary string to metric values. *Note* When passing TrainingArgs with `batch_eval_metrics` set to
+            `True`, your compute_metrics function must take a boolean `compute_result` argument. This will be triggered
+            after the last eval batch to signal that the function needs to calculate and return the global summary
+            statistics rather than accumulating the batch-level statistics.
+        callbacks (list of [`~transformers.TrainerCallback`], *optional*):
+            List of callbacks to customize the training loop. Will add those to the list of default callbacks detailed
+            in [here](https://huggingface.co/docs/transformers/main_classes/callback).
+
+            If you want to remove one of the default callbacks used, use the [`~transformers.Trainer.remove_callback`]
+            method.
+        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*, defaults to `(None, None)`):
+            A tuple containing the optimizer and the scheduler to use. Will default to an instance of [`AdamW`] on your
+            model and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`.
+        optimizer_cls_and_kwargs (`Tuple[Type[torch.optim.Optimizer], Dict[str, Any]]`, *optional*):
+            A tuple containing the optimizer class and keyword arguments to use. Overrides `optim` and `optim_args` in
+            `args`. Incompatible with the `optimizers` argument.
+        preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`, *optional*):
+            A function that preprocess the logits right before caching them at each evaluation step. Must take two
+            tensors, the logits and the labels, and return the logits once processed as desired. The modifications made
+            by this function will be reflected in the predictions received by `compute_metrics`.
+
+            Note that the labels (second parameter) will be `None` if the dataset does not have them.
+        peft_config ([`~peft.PeftConfig`], *optional*):
+            PEFT configuration used to wrap the model. If `None`, the model is not wrapped.
+    """
+
+    _tag_names = ["trl", "dpo"]
+    _name = "DPO"
+    _paper = {
+        "title": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model",
+        "id": "2305.18290",
+        # docstyle-ignore
+        "citation": textwrap.dedent("""\
+            @inproceedings{rafailov2023direct,
+                title        = {{Direct Preference Optimization: Your Language Model is Secretly a Reward Model}},
+                author       = {Rafael Rafailov and Archit Sharma and Eric Mitchell and Christopher D. Manning and Stefano Ermon and Chelsea Finn},
+                year         = 2023,
+                booktitle    = {Advances in Neural Information Processing Systems 36: Annual Conference on Neural Information Processing Systems 2023, NeurIPS 2023, New Orleans, LA, USA, December 10 - 16, 2023},
+                url          = {http://papers.nips.cc/paper_files/paper/2023/hash/a85b405ed65c6477a4fe8302b5e06ce7-Abstract-Conference.html},
+                editor       = {Alice Oh and Tristan Naumann and Amir Globerson and Kate Saenko and Moritz Hardt and Sergey Levine},
+            }"""),
+    }
+
+    def __init__(
+        self,
+        model: str | nn.Module | PreTrainedModel,
+        ref_model: PreTrainedModel | nn.Module | str | None = None,
+        args: DPOConfig | None = None,
+        data_collator: DataCollator | None = None,  # type: ignore
+        train_dataset: Dataset | IterableDataset | None = None,
+        eval_dataset: Dataset
+        | IterableDataset
+        | dict[str, Dataset | IterableDataset]
+        | None = None,
+        processing_class: PreTrainedTokenizerBase
+        | BaseImageProcessor
+        | FeatureExtractionMixin
+        | ProcessorMixin
+        | None = None,
+        compute_metrics: Callable[[EvalLoopOutput], dict] | None = None,
+        callbacks: list[TrainerCallback] | None = None,
+        optimizers: tuple[
+            torch.optim.Optimizer | None, torch.optim.lr_scheduler.LambdaLR | None
+        ] = (None, None),
+        optimizer_cls_and_kwargs: tuple[type[torch.optim.Optimizer], dict[str, Any]]
+        | None = None,
+        preprocess_logits_for_metrics: Callable[
+            [torch.Tensor, torch.Tensor], torch.Tensor
+        ]
+        | None = None,
+        peft_config: "PeftConfig | None" = None,
+    ):
+        # Args
+        if args is None:
+            model_name = (
+                model if isinstance(model, str) else get_config_model_id(model.config)
+            )
+            model_name = model_name.split("/")[-1]
+            args = DPOConfig(f"{model_name}-DPO")
+
+        # Model and reference model
+        if isinstance(model, str):
+            model = create_model_from_path(model, **args.model_init_kwargs or {})
+        elif args.model_init_kwargs is not None:
+            logger.warning(
+                "You passed `model_init_kwargs` to the `DPOConfig`, but your model is already instantiated. "
+                "The `model_init_kwargs` will be ignored."
+            )
+        model_id = get_config_model_id(model.config)
+        if isinstance(ref_model, str):
+            ref_model = create_model_from_path(
+                ref_model, **args.ref_model_init_kwargs or {}
+            )
+        elif args.ref_model_init_kwargs is not None:
+            logger.warning(
+                "You passed `ref_model_init_kwargs` to the `DPOConfig`, but your model is already instantiated. "
+                "The `ref_model_init_kwargs` will be ignored."
+            )
+        if ref_model is model:
+            raise ValueError(
+                "`model` and `ref_model` cannot be the same object. If you want `ref_model` to be the "
+                "same as `model`, you can simply omit the `ref_model` argument and it will be created for you."
+            )
+
+        # Processing class
+        if processing_class is None:
+            processing_class = AutoProcessor.from_pretrained(model_id)
+
+        # Handle pad token for processors or tokenizers
+        if isinstance(processing_class, ProcessorMixin):
+            tokenizer = processing_class.tokenizer
+            self._is_vlm = True
+        elif isinstance(processing_class, PreTrainedTokenizerBase):
+            tokenizer = processing_class
+            self._is_vlm = False
+        else:
+            raise TypeError(
+                "The `processing_class` must be either a `PreTrainedTokenizerBase` or a `ProcessorMixin`"
+            )
+
+        # Get the pad token: if not provided, use the one from the processing class or the eos token
+        # if the processing class does not have a pad token.
+        if args.padding_value is not None:  # deprecated, will be removed in 0.26.0.
+            warnings.warn(
+                "The `padding_value` argument is deprecated and will be removed in version 0.26.0. Please use "
+                "`pad_token` (str) instead."
+            )
+            self.pad_token_id = args.padding_value
+        else:
+            pad_token = args.pad_token or tokenizer.pad_token or tokenizer.eos_token
+            self.pad_token_id = tokenizer.convert_tokens_to_ids(pad_token)
+            if self.pad_token_id is None:
+                raise ValueError(
+                    f"The specified `pad_token` ('{pad_token}') is not found in the vocabulary of the given "
+                    f"`processing_class` ({processing_class.__class__.__name__}). Ensure that the `pad_token` exists "
+                    "in the vocabulary before using it as a padding token."
+                )
+
+        # PEFT configuration and model wrapping
+        model = self._prepare_peft_model(model, ref_model, peft_config, args)
+
+        if args.generate_during_eval and not (
+            is_wandb_available() or is_comet_available() or is_mlflow_available()
+        ):
+            raise ValueError(
+                "`generate_during_eval=True` requires Weights and Biases, MLFlow or Comet to be installed."
+                " Please install `wandb`, `mlflow` or `comet-ml` to resolve."
+            )
+
+        self.is_encoder_decoder = model.config.is_encoder_decoder
+        self.is_vision_model = (
+            model.config.model_type in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.keys()
+        )
+        self.is_peft_model = is_peft_available() and isinstance(model, PeftModel)
+        self.model_adapter_name = args.model_adapter_name
+        self.ref_adapter_name = args.ref_adapter_name
+        self.reference_free = args.reference_free
+
+        if ref_model:
+            self.ref_model = ref_model
+        elif self.is_peft_model or args.precompute_ref_log_probs:
+            # The `model` with adapters turned off will be used as the reference model
+            self.ref_model = None
+        else:
+            self.ref_model = create_reference_model(model)
+
+        # Disable dropout in the model and reference model
+        if args.disable_dropout:
+            disable_dropout_in_model(model)
+            if self.ref_model is not None:
+                disable_dropout_in_model(self.ref_model)
+
+        # Liger kernel
+        if args.use_liger_kernel:
+            if not is_liger_kernel_available():
+                raise ImportError(
+                    "You set `use_liger_kernel=True` but the liger kernel is not available. "
+                    "Please install liger-kernel first: `pip install liger-kernel`"
+                )
+            if args.loss_type not in [
+                "sigmoid",
+                "apo_zero",
+                "apo_down",
+                "sppo_hard",
+                "nca_pair",
+            ]:
+                raise ValueError(
+                    "You set `use_liger_kernel=True` but the loss type is not from `[sigmoid, apo_zero, apo_down, sppo_hard, nca_pair`. "
+                    "Please set `loss_type='[sigmoid | apo_zero | apo_down | sppo_hard | nca_pair]'` to use the liger kernel."
+                )
+            self.dpo_loss_fn = LigerFusedLinearDPOLoss(
+                ignore_index=args.label_pad_token_id,
+                beta=args.beta,
+                use_ref_model=not args.reference_free,
+                average_log_prob=False,
+                loss_type=args.loss_type,
+            )
+        # The trainer estimates the number of FLOPs (floating-point operations) using the number of elements in the
+        # input tensor associated with the key "input_ids". However, in DPO, the sampled data does not include the
+        # "input_ids" key. Instead, the available keys are "prompt_input_ids", "chosen_input_ids", and
+        # "rejected_input_ids". As a result, the trainer issues the warning: "Could not estimate the number of tokens
+        # of the input, floating-point operations will not be computed." To suppress this warning, we set the
+        # "estimate_tokens" key in the model's "warnings_issued" dictionary to True. This acts as a flag to indicate
+        # that the warning has already been issued.
+        model.warnings_issued["estimate_tokens"] = True
+
+        # Data collator
+        if data_collator is None:
+            data_collator = DataCollatorForPreference(pad_token_id=self.pad_token_id)
+
+        self.generate_during_eval = args.generate_during_eval
+        self.label_pad_token_id = args.label_pad_token_id
+        self.max_prompt_length = args.max_prompt_length
+        self.max_completion_length = args.max_completion_length
+        self.max_length = args.max_length
+        self.truncation_mode = args.truncation_mode
+        self.precompute_ref_log_probs = args.precompute_ref_log_probs
+        self.use_logits_to_keep = args.use_logits_to_keep
+
+        if args.padding_free:
+            if model.config._attn_implementation != "flash_attention_2":
+                logger.warning(
+                    "Padding-free training is enabled, but the attention implementation is not set to "
+                    "'flash_attention_2'. Padding-free training flattens batches into a single sequence, and "
+                    "'flash_attention_2' is the only known attention mechanism that reliably supports this. Using "
+                    "other implementations may lead to unexpected behavior. To ensure compatibility, set "
+                    "`attn_implementation='flash_attention_2'` in the model configuration, or verify that your "
+                    "attention mechanism can handle flattened sequences."
+                )
+            if args.per_device_train_batch_size == 1:
+                logger.warning(
+                    "You are using a per_device_train_batch_size of 1 with padding-free training. Using a batch size "
+                    "of 1 anihilate the benefits of padding-free training. Please consider increasing the batch size "
+                    "to at least 2."
+                )
+        self.padding_free = args.padding_free
+
+        # Since ref_logs are precomputed on the first call to get_train/eval_dataloader
+        # keep track of first called to avoid computation of future calls
+        self._precomputed_train_ref_log_probs = False
+        self._precomputed_eval_ref_log_probs = False
+
+        self.beta = args.beta
+        self.label_smoothing = args.label_smoothing
+        self.loss_type = (
+            args.loss_type if isinstance(args.loss_type, list) else [args.loss_type]
+        )
+        self.loss_weights = args.loss_weights
+        self.aux_loss_enabled = getattr(model.config, "output_router_logits", False)
+        self.use_weighting = args.use_weighting
+        self.aux_loss_coef = getattr(model.config, "router_aux_loss_coef", 0.0)
+        if self.aux_loss_enabled and self.aux_loss_coef == 0.0:
+            logger.warning(
+                "You set `output_router_logits` to `True` in the model config, but `router_aux_loss_coef` is set to "
+                "`0.0`, meaning the auxiliary loss will not be used. Either set `router_aux_loss_coef` to a value "
+                "greater than `0.0`, or set `output_router_logits` to `False` if you don't want to use the auxiliary "
+                "loss.",
+            )
+        for loss_type in self.loss_type:
+            if (
+                loss_type
+                in [
+                    "hinge",
+                    "ipo",
+                    "bco_pair",
+                    "sppo_hard",
+                    "nca_pair",
+                    "apo_zero",
+                    "apo_down",
+                ]
+                and args.label_smoothing > 0
+            ):
+                logger.warning(
+                    f"You are using the {loss_type} loss type that does not support label smoothing. The "
+                    "`label_smoothing` parameter will be ignored. Set `label_smoothing` to `0.0` to remove this "
+                    "warning.",
+                )
+            if loss_type == "kto_pair":
+                raise ValueError(
+                    "Support for kto_pair has been removed in DPOTrainer. Please use KTOTrainer."
+                )
+
+        self._stored_metrics = defaultdict(lambda: defaultdict(list))
+        self.f_divergence_type = args.f_divergence_type
+        self.f_divergence_params = {
+            FDivergenceConstants.ALPHA_DIVERGENCE_COEF_KEY: args.f_alpha_divergence_coef
+        }
+        self.dataset_num_proc = args.dataset_num_proc
+
+        # Dataset preparation
+        train_dataset = self._prepare_dataset(
+            train_dataset, processing_class, args, "train"
+        )
+        if eval_dataset is not None:
+            if isinstance(eval_dataset, dict):
+                eval_dataset = {
+                    key: self._prepare_dataset(dataset, processing_class, args, key)
+                    for key, dataset in eval_dataset.items()
+                }
+            else:
+                eval_dataset = self._prepare_dataset(
+                    eval_dataset, processing_class, args, "eval"
+                )
+
+        super().__init__(
+            model=model,
+            args=args,
+            data_collator=data_collator,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            processing_class=processing_class,
+            compute_metrics=compute_metrics,
+            callbacks=callbacks,
+            optimizers=optimizers,
+            optimizer_cls_and_kwargs=optimizer_cls_and_kwargs,
+            preprocess_logits_for_metrics=preprocess_logits_for_metrics,
+        )
+
+        # Gradient accumulation requires scaled loss. Normally, loss scaling in the parent class depends on whether the
+        # model accepts loss-related kwargs. Since we compute our own loss, this check is irrelevant. We set
+        # self.model_accepts_loss_kwargs to False to enable scaling.
+        self.model_accepts_loss_kwargs = False
+
+        # Add tags for models that have been loaded with the correct transformers version
+        if hasattr(self.model, "add_model_tags"):
+            self.model.add_model_tags(self._tag_names)
+
+        if not hasattr(self, "accelerator"):
+            raise AttributeError(
+                "Your `Trainer` does not have an `accelerator` object. Consider upgrading `transformers`."
+            )
+
+        # Deepspeed Zero-3 does not support precompute_ref_log_probs
+        if self.is_deepspeed_enabled:
+            if (
+                self.accelerator.state.deepspeed_plugin.zero_stage == 3
+                and self.precompute_ref_log_probs
+            ):
+                raise ValueError(
+                    "You cannot use `precompute_ref_log_probs=True` with Deepspeed ZeRO-3. Please set `precompute_ref_log_probs=False`."
+                )
+
+        if self.ref_model is None:
+            if not (self.is_peft_model or self.precompute_ref_log_probs):
+                raise ValueError(
+                    "No reference model and model is not a Peft model. Try setting `precompute_ref_log_probs=True`"
+                )
+            if args.sync_ref_model:
+                raise ValueError(
+                    "You currently cannot use `ref_model=None` with TR-DPO method. Please provide `ref_model`."
+                )
+        elif self.is_deepspeed_enabled:
+            self.ref_model = prepare_deepspeed(self.ref_model, self.accelerator)
+        elif self.is_fsdp_enabled:
+            self.ref_model = prepare_fsdp(self.ref_model, self.accelerator)
+        else:
+            self.ref_model = self.accelerator.prepare_model(
+                self.ref_model, evaluation_mode=True
+            )
+
+        if args.sync_ref_model:
+            if self.precompute_ref_log_probs:
+                raise ValueError(
+                    "You cannot use `precompute_ref_log_probs=True` with TR-DPO method. Please set `precompute_ref_log_probs=False`."
+                )
+
+            self.add_callback(
+                SyncRefModelCallback(
+                    ref_model=self.ref_model, accelerator=self.accelerator
+                )
+            )
+
+        if "bco_pair" in self.loss_type:
+            self.running = RunningMoments(self.accelerator)
+
+    @property
+    def padding_value(self):
+        warnings.warn(
+            "The `padding_value` property is deprecated and will be removed in version 0.26.0. Please use "
+            "`pad_token_id` instead.",
+        )
+        return self.pad_token_id
+
+    @padding_value.setter
+    def padding_value(self, value):
+        warnings.warn(
+            "The `padding_value` property is deprecated and will be removed in version 0.26.0. Please use "
+            "`pad_token_id` instead.",
+        )
+        self.pad_token_id = value
+
+    def _prepare_peft_model(
+        self,
+        model: PreTrainedModel,
+        ref_model: PreTrainedModel,
+        peft_config: Any,
+        args: DPOConfig,
+    ) -> PreTrainedModel:
+        """Prepares a model for PEFT training."""
+        # Initialize this variable to False. This helps tracking the case when `peft_module_casting_to_bf16`
+        # has been called in order to properly call autocast if needed.
+        self._peft_has_been_casted_to_bf16 = False
+
+        if not is_peft_available() and peft_config is not None:
+            raise ValueError(
+                "PEFT is not installed and you passed a `peft_config` in the trainer's kwargs, please install it to use the PEFT models"
+            )
+        if is_peft_available() and peft_config is not None:
+            # if model is a peft model and we have a peft_config, we merge and unload it first
+            if isinstance(model, PeftModel):
+                model = model.merge_and_unload()
+
+            if ref_model is not None and not args.force_use_ref_model:
+                raise ValueError(
+                    "You passed both a ref_model and a peft_config. For training PEFT adapters with DPO there is no need to pass a reference"
+                    " model. Please pass `ref_model=None` in case you want to train PEFT adapters, or pass a ref_model with `force_use_ref_model=True` in DPOTrainer's init."
+                    " if you want to use a different ref_model."
+                )
+
+            if getattr(model, "is_loaded_in_8bit", False) or getattr(
+                model, "is_loaded_in_4bit", False
+            ):
+                _support_gc_kwargs = hasattr(
+                    args, "gradient_checkpointing_kwargs"
+                ) and "gradient_checkpointing_kwargs" in list(
+                    inspect.signature(prepare_model_for_kbit_training).parameters
+                )
+
+                prepare_model_kwargs = {
+                    "use_gradient_checkpointing": args.gradient_checkpointing
+                }
+
+                if _support_gc_kwargs:
+                    prepare_model_kwargs["gradient_checkpointing_kwargs"] = (
+                        args.gradient_checkpointing_kwargs
+                    )
+
+                model = prepare_model_for_kbit_training(model, **prepare_model_kwargs)
+
+            else:
+                model = self._prepare_gradient_checkpointing(model, args)
+
+            # get peft model with the given config
+            model = get_peft_model(model, peft_config)
+            if args.bf16 and getattr(model, "is_loaded_in_4bit", False):
+                peft_module_casting_to_bf16(model)
+                # If args.bf16 we need to explicitly call `generate` with torch amp autocast context manager
+                self._peft_has_been_casted_to_bf16 = True
+
+        else:
+            model = self._prepare_gradient_checkpointing(model, args)
+
+        return model
+
+    def _prepare_gradient_checkpointing(self, model: PreTrainedModel, args: DPOConfig):
+        """Prepare the gradienting checkpointing for the model."""
+        # For models that use gradient_checkpointing, we need to attach a hook that enables input
+        # to explicitly have `requires_grad=True`, otherwise training will either silently
+        # fail or completely fail.
+        if args.gradient_checkpointing:
+            # For backward compatibility with older versions of transformers
+            if hasattr(model, "enable_input_require_grads"):
+                model.enable_input_require_grads()
+            else:
+
+                def make_inputs_require_grad(module, input, output):
+                    output.requires_grad_(True)
+
+                model.get_input_embeddings().register_forward_hook(
+                    make_inputs_require_grad
+                )
+
+        return model
+
+    def _prepare_dataset(
+        self,
+        dataset: Dataset | IterableDataset,
+        processing_class: PreTrainedTokenizerBase
+        | BaseImageProcessor
+        | FeatureExtractionMixin
+        | ProcessorMixin,
+        args: DPOConfig,
+        dataset_name: str,
+    ) -> Dataset | IterableDataset:
+        # Build the kwargs for the `map` function
+        map_kwargs = {}
+        if isinstance(
+            dataset, Dataset
+        ):  # IterableDataset does not support num_proc nor writer_batch_size
+            map_kwargs["num_proc"] = args.dataset_num_proc
+            map_kwargs["writer_batch_size"] = 10
+
+        with PartialState().main_process_first():
+            # Extract prompt if needed
+            if isinstance(
+                dataset, Dataset
+            ):  # `IterableDataset.map` does not support `desc`
+                map_kwargs["desc"] = f"Extracting prompt in {dataset_name} dataset"
+            dataset = dataset.map(maybe_extract_prompt, **map_kwargs)
+
+            # Apply the chat template if needed
+            if isinstance(
+                dataset, Dataset
+            ):  # `IterableDataset.map` does not support `desc`
+                map_kwargs["desc"] = f"Applying chat template to {dataset_name} dataset"
+            dataset = dataset.map(
+                maybe_apply_chat_template,
+                fn_kwargs={"tokenizer": processing_class, "tools": args.tools},
+                **map_kwargs,
+            )
+
+            # Tokenize the dataset
+            if isinstance(
+                dataset, Dataset
+            ):  # `IterableDataset.map` does not support `desc`
+                map_kwargs["desc"] = f"Tokenizing {dataset_name} dataset"
+
+            dataset = dataset.map(
+                self.tokenize_row if not self.is_vision_model else self.process_row,
+                remove_columns=["chosen", "rejected"],
+                fn_kwargs={
+                    "processing_class": processing_class,
+                    "max_prompt_length": args.max_prompt_length,
+                    "max_completion_length": args.max_completion_length,
+                    # for enc-dec, we add the special tokens ([bos_token] + prompt + [eos_token]; completion + [eos_token])
+                    "add_special_tokens": False,
+                },
+                **map_kwargs,
+            )
+
+        return dataset
+
+    @staticmethod
+    def tokenize_row(
+        features: dict[str, str],
+        processing_class: PreTrainedTokenizerBase,
+        max_prompt_length: int | None = None,
+        max_completion_length: int | None = None,
+        add_special_tokens: bool = True,
+    ) -> dict[str, list[int]]:
+        """
+        Tokenize a row of the dataset.
+
+        Args:
+            features (`dict[str, str]`):
+                Row of the dataset, should contain the keys `"prompt"`, `"chosen"`, and `"rejected"`.
+            processing_class ([`~transformers.PreTrainedTokenizerBase`]):
+                Processing class used to process the data.
+            max_prompt_length (`int` or `None`):
+                Maximum length of the prompt sequence. If `None`, the prompt sequence is not truncated.
+            max_completion_length (`int` or `None`):
+                Maximum length of the completion sequences. If `None`, the completion sequences are not truncated.
+            add_special_tokens (`bool`):
+                Whether to add special tokens to the sequences. Typically used for encoder-decoder models. If `True`,
+                the prompt sequence will have a bos token prepended and an eos token appended. In any case, the
+                completion sequences will have an eos token appended.
+
+        Returns
+        -------
+            `dict[str, list[int]]`:
+                Tokenized sequences with the keys `"prompt_input_ids"`, `"chosen_input_ids"`, and
+                `"rejected_input_ids".
+
+        Example:
+        ```python
+        >>> from transformers import GPT2Tokenizer
+
+        >>> tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+        >>> features = {"prompt": "The sky is", "chosen": " blue", "rejected": " green"}
+        >>> DPOTrainer.tokenize_row(
+        ...     features, tokenizer, max_prompt_length=3, max_completion_length=3, add_special_tokens=False
+        ... )
+        {'prompt_input_ids': [464, 6766, 318], 'chosen_input_ids': [4171, 50256], 'rejected_input_ids': [4077, 50256]}
+        ```
+        """
+        tokenizer = processing_class  # the processing class is a tokenizer
+        prompt_input_ids = tokenizer(features["prompt"], add_special_tokens=False)[
+            "input_ids"
+        ]
+        chosen_input_ids = tokenizer(features["chosen"], add_special_tokens=False)[
+            "input_ids"
+        ]
+        rejected_input_ids = tokenizer(features["rejected"], add_special_tokens=False)[
+            "input_ids"
+        ]
+
+        # Add special tokens (typically for encoder-decoder models)
+        if add_special_tokens:
+            if tokenizer.bos_token_id is not None:
+                prompt_input_ids = [tokenizer.bos_token_id] + prompt_input_ids
+            if tokenizer.eos_token_id is not None:
+                prompt_input_ids = prompt_input_ids + [tokenizer.eos_token_id]
+        chosen_input_ids = chosen_input_ids + [tokenizer.eos_token_id]
+        rejected_input_ids = rejected_input_ids + [tokenizer.eos_token_id]
+
+        # Truncate prompt and completion sequences
+        if max_prompt_length is not None:
+            prompt_input_ids = prompt_input_ids[-max_prompt_length:]
+        if max_completion_length is not None:
+            chosen_input_ids = chosen_input_ids[:max_completion_length]
+            rejected_input_ids = rejected_input_ids[:max_completion_length]
+
+        return {
+            "prompt_input_ids": prompt_input_ids,
+            "chosen_input_ids": chosen_input_ids,
+            "rejected_input_ids": rejected_input_ids,
+        }
+
+    @staticmethod
+    def process_row(
+        features: dict[str, str],
+        processing_class: PreTrainedTokenizerBase,
+        max_prompt_length: int | None = None,
+        max_completion_length: int | None = None,
+        add_special_tokens: bool = True,
+    ) -> dict[str, list[int]]:
+        """
+        Same as `tokenize_row` but for vision models. Please refer to `tokenize_row` for more information.
+        """
+        processor, tokenizer = (
+            processing_class,
+            processing_class.tokenizer,
+        )  # the processing class is a processor
+        processed_features = processor(
+            images=features["images"], text=features["prompt"], add_special_tokens=False
+        )
+
+        prompt_input_ids = processed_features["input_ids"][0]
+        pixel_values = processed_features["pixel_values"][0]
+        chosen_input_ids = tokenizer(features["chosen"], add_special_tokens=False)[
+            "input_ids"
+        ]
+        rejected_input_ids = tokenizer(features["rejected"], add_special_tokens=False)[
+            "input_ids"
+        ]
+
+        # Add special tokens (typically for encoder-decoder models)
+        if add_special_tokens:
+            if tokenizer.bos_token_id is not None:
+                prompt_input_ids = [tokenizer.bos_token_id] + prompt_input_ids
+            if tokenizer.eos_token_id is not None:
+                prompt_input_ids = prompt_input_ids + [tokenizer.eos_token_id]
+        chosen_input_ids = chosen_input_ids + [tokenizer.eos_token_id]
+        rejected_input_ids = rejected_input_ids + [tokenizer.eos_token_id]
+
+        # Truncate prompt and completion sequences
+        if max_prompt_length is not None:
+            prompt_input_ids = prompt_input_ids[-max_prompt_length:]
+        if max_completion_length is not None:
+            chosen_input_ids = chosen_input_ids[:max_completion_length]
+            rejected_input_ids = rejected_input_ids[:max_completion_length]
+
+        output = {
+            "prompt_input_ids": prompt_input_ids,
+            "pixel_values": pixel_values,
+            "chosen_input_ids": chosen_input_ids,
+            "rejected_input_ids": rejected_input_ids,
+        }
+
+        if "pixel_attention_mask" in processed_features:
+            output["pixel_attention_mask"] = processed_features["pixel_attention_mask"][
+                0
+            ]
+        if "image_sizes" in processed_features:
+            output["image_sizes"] = processed_features["image_sizes"][0]
+        if "token_type_ids" in processed_features:
+            output["token_type_ids"] = processed_features["token_type_ids"][0]
+
+        return output
+
+    def _set_signature_columns_if_needed(self):
+        # If `self.args.remove_unused_columns` is True, non-signature columns are removed.
+        # By default, this method sets `self._signature_columns` to the model's expected inputs.
+        # In DPOTrainer, we preprocess data, so using the model's signature columns doesn't work.
+        # Instead, we set them to the columns expected by `DataCollatorForPreference`, hence the override.
+        if self._signature_columns is None:
+            self._signature_columns = [
+                "prompt_input_ids",
+                "chosen_input_ids",
+                "rejected_input_ids",
+                "image_sizes",
+                "token_type_ids",
+                "ref_chosen_logps",
+                "ref_rejected_logps",
+            ]
+
+    def get_train_dataloader(self) -> DataLoader:
+        """
+        Returns the training [`~torch.utils.data.DataLoader`].
+
+        Subclass of transformers.src.transformers.trainer.get_train_dataloader to precompute `ref_log_probs`.
+        """
+        if self.precompute_ref_log_probs and not self._precomputed_train_ref_log_probs:
+            batch_size = (
+                self.args.precompute_ref_batch_size
+                or self.args.per_device_train_batch_size
+            )
+            dataloader_params = {
+                "batch_size": batch_size,
+                "collate_fn": self.data_collator,
+                "num_workers": self.args.dataloader_num_workers,
+                "pin_memory": self.args.dataloader_pin_memory,
+                "shuffle": False,
+            }
+
+            # prepare dataloader
+            data_loader = self.accelerator.prepare(
+                DataLoader(self.train_dataset, **dataloader_params)
+            )
+
+            ref_chosen_logps = []
+            ref_rejected_logps = []
+            for padded_batch in tqdm(
+                iterable=data_loader, desc="Train dataset reference log probs"
+            ):
+                ref_chosen_logp, ref_rejected_logp = self.compute_ref_log_probs(
+                    padded_batch
+                )
+                ref_chosen_logp, ref_rejected_logp = (
+                    self.accelerator.gather_for_metrics(
+                        (ref_chosen_logp, ref_rejected_logp)
+                    )
+                )
+                ref_chosen_logps.append(ref_chosen_logp.cpu())
+                ref_rejected_logps.append(ref_rejected_logp.cpu())
+
+                # Unnecessary cache clearing to avoid OOM
+                empty_cache()
+                self.accelerator.free_memory()
+
+            all_ref_chosen_logps = torch.cat(ref_chosen_logps).float().numpy()
+            all_ref_rejected_logps = torch.cat(ref_rejected_logps).float().numpy()
+
+            self.train_dataset = self.train_dataset.add_column(
+                name="ref_chosen_logps", column=all_ref_chosen_logps
+            )
+            self.train_dataset = self.train_dataset.add_column(
+                name="ref_rejected_logps", column=all_ref_rejected_logps
+            )
+
+            self._precomputed_train_ref_log_probs = True
+
+        return super().get_train_dataloader()
+
+    def get_eval_dataloader(self, eval_dataset: Dataset | None = None) -> DataLoader:
+        """
+        Returns the evaluation [`~torch.utils.data.DataLoader`].
+
+        Subclass of transformers.src.transformers.trainer.get_eval_dataloader to precompute `ref_log_probs`.
+
+        Args:
+            eval_dataset (`torch.utils.data.Dataset`, *optional*):
+                If provided, will override `self.eval_dataset`. If it is a [`~datasets.Dataset`], columns not accepted
+                by the `model.forward()` method are automatically removed. It must implement `__len__`.
+        """
+        if eval_dataset is None and self.eval_dataset is None:
+            raise ValueError("Trainer: evaluation requires an eval_dataset.")
+        eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset
+
+        if self.precompute_ref_log_probs and not self._precomputed_eval_ref_log_probs:
+            batch_size = (
+                self.args.precompute_ref_batch_size
+                or self.args.per_device_eval_batch_size
+            )
+            dataloader_params = {
+                "batch_size": batch_size,
+                "collate_fn": self.data_collator,
+                "num_workers": self.args.dataloader_num_workers,
+                "pin_memory": self.args.dataloader_pin_memory,
+                "shuffle": False,
+            }
+
+            # prepare dataloader
+            data_loader = self.accelerator.prepare(
+                DataLoader(eval_dataset, **dataloader_params)
+            )
+
+            ref_chosen_logps = []
+            ref_rejected_logps = []
+            for padded_batch in tqdm(
+                iterable=data_loader, desc="Eval dataset reference log probs"
+            ):
+                ref_chosen_logp, ref_rejected_logp = self.compute_ref_log_probs(
+                    padded_batch
+                )
+                ref_chosen_logp, ref_rejected_logp = (
+                    self.accelerator.gather_for_metrics(
+                        (ref_chosen_logp, ref_rejected_logp)
+                    )
+                )
+                ref_chosen_logps.append(ref_chosen_logp.cpu())
+                ref_rejected_logps.append(ref_rejected_logp.cpu())
+
+            all_ref_chosen_logps = torch.cat(ref_chosen_logps).float().numpy()
+            all_ref_rejected_logps = torch.cat(ref_rejected_logps).float().numpy()
+
+            eval_dataset = eval_dataset.add_column(
+                name="ref_chosen_logps", column=all_ref_chosen_logps
+            )
+            eval_dataset = eval_dataset.add_column(
+                name="ref_rejected_logps", column=all_ref_rejected_logps
+            )
+
+            # Save calculated ref_chosen_logps and ref_rejected_logps to the eval_dataset for subsequent runs
+            if self.eval_dataset is not None:
+                self.eval_dataset = eval_dataset
+            self._precomputed_eval_ref_log_probs = True
+
+        return super().get_eval_dataloader(eval_dataset=eval_dataset)
+
+    @contextmanager
+    def null_ref_context(self):
+        """Context manager for handling null reference model (that is, peft adapter manipulation)."""
+        with (
+            self.accelerator.unwrap_model(self.model).disable_adapter()
+            if self.is_peft_model and not self.ref_adapter_name
+            else nullcontext()
+        ):
+            if self.ref_adapter_name:
+                self.model.set_adapter(self.ref_adapter_name)
+            yield
+            if self.ref_adapter_name:
+                self.model.set_adapter(self.model_adapter_name or "default")
+
+    def compute_ref_log_probs(
+        self, batch: dict[str, torch.LongTensor]
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Computes log probabilities of the reference model for a single padded batch of a DPO specific dataset."""
+        compte_ref_context_manager = (
+            autocast(self.accelerator.device.type)
+            if self._peft_has_been_casted_to_bf16
+            else nullcontext()
+        )
+        with torch.no_grad(), compte_ref_context_manager:
+            if self.ref_model is None:
+                with self.null_ref_context():
+                    ref_model_output = self.concatenated_forward(
+                        self.model, batch, is_ref_model=True
+                    )
+            else:
+                ref_model_output = self.concatenated_forward(
+                    self.ref_model, batch, is_ref_model=True
+                )
+        return ref_model_output["chosen_logps"], ref_model_output["rejected_logps"]
+
+    @staticmethod
+    def concatenated_inputs(
+        batch: dict[str, list | torch.LongTensor], padding_value: int
+    ) -> dict[str, torch.LongTensor]:
+        """
+        Concatenate the `chosen` and `rejected` inputs from the batch into a single tensor for both the prompt and
+        completion sequences.
+
+        Args:
+            batch (`dict[str, list | torch.LongTensor]`):
+                A batch of input data. The batch must contain the following keys:
+
+                - `"prompt_input_ids"`: Tensor of shape `(batch_size, prompt_length)` representing the prompt input
+                  IDs.
+                - `"chosen_input_ids"`: Tensor of shape `(batch_size, chosen_length)` representing the chosen
+                  completion input IDs.
+                - `"rejected_input_ids"`: Tensor of shape `(batch_size, rejected_length)` representing the rejected
+                  completion input IDs.
+                - `"prompt_pixel_values"` (optional): Tensor for pixel values, if available.
+                - `"prompt_pixel_attention_mask"` (optional): Tensor for pixel attention masks, if available.
+
+            padding_value (`int`):
+                The padding value to use for the concatenated completion sequences (`chosen_input_ids` and
+                `rejected_input_ids`).
+
+        Returns
+        -------
+            `dict[str, torch.LongTensor]`: A dictionary containing:
+
+                - `"prompt_input_ids"`: Concatenated prompt input IDs of shape `(2 * batch_size, prompt_length)`.
+                - `"completion_input_ids"`: Concatenated chosen and rejected completion input IDs of shape `(2 *
+                  batch_size, max_completion_length)`.
+                - `"prompt_attention_mask"`: Concatenated prompt attention masks of shape `(2 * batch_size,
+                  prompt_length)`.
+                - `"completion_attention_mask"`: Concatenated chosen and rejected attention masks of shape `(2 *
+                  batch_size, max_completion_length)`.
+                - `"pixel_values"` (optional): Concatenated pixel values if `"prompt_pixel_values"` are present.
+                - `"pixel_attention_mask"` (optional): Concatenated pixel attention masks if
+                  `"prompt_pixel_attention_mask"` are present.
+
+        Notes
+        -----
+            The completion input IDs and attention masks are padded to the maximum completion length of the chosen or
+            rejected sequences.
+        """
+        output = {}
+
+        # For the prompt, the input_ids are the same for both the chosen and rejected responses
+        output["prompt_input_ids"] = torch.cat(
+            [batch["prompt_input_ids"], batch["prompt_input_ids"]], dim=0
+        )
+        output["prompt_attention_mask"] = torch.cat(
+            [batch["prompt_attention_mask"], batch["prompt_attention_mask"]], dim=0
+        )
+        if "pixel_values" in batch:
+            output["pixel_values"] = torch.cat(
+                [batch["pixel_values"], batch["pixel_values"]], dim=0
+            )
+
+        if "pixel_attention_mask" in batch:
+            output["pixel_attention_mask"] = torch.cat(
+                [batch["pixel_attention_mask"], batch["pixel_attention_mask"]], dim=0
+            )
+        if "image_sizes" in batch:
+            output["image_sizes"] = torch.cat(
+                [batch["image_sizes"], batch["image_sizes"]], dim=0
+            )
+        if "token_type_ids" in batch:
+            output["token_type_ids"] = torch.cat(
+                (batch["token_type_ids"], batch["token_type_ids"])
+            )
+
+        # Concatenate the chosen and rejected completions
+        max_completion_length = max(
+            batch["chosen_input_ids"].shape[1], batch["rejected_input_ids"].shape[1]
+        )
+        output["completion_input_ids"] = torch.cat(
+            (
+                pad_to_length(
+                    batch["chosen_input_ids"],
+                    max_completion_length,
+                    pad_value=padding_value,
+                ),
+                pad_to_length(
+                    batch["rejected_input_ids"],
+                    max_completion_length,
+                    pad_value=padding_value,
+                ),
+            ),
+        )
+        output["completion_attention_mask"] = torch.cat(
+            (
+                pad_to_length(
+                    batch["chosen_attention_mask"], max_completion_length, pad_value=0
+                ),
+                pad_to_length(
+                    batch["rejected_attention_mask"], max_completion_length, pad_value=0
+                ),
+            ),
+        )
+
+        return output
+
+    def dpo_loss(
+        self,
+        chosen_logps: torch.FloatTensor,
+        rejected_logps: torch.FloatTensor,
+        ref_chosen_logps: torch.FloatTensor,
+        ref_rejected_logps: torch.FloatTensor,
+        loss_type: str = "sigmoid",
+        model_output: dict[str, torch.FloatTensor] = None,
+    ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
+        """
+        Compute the DPO loss for a batch of policy and reference model log probabilities.
+
+        Args:
+            chosen_logps (`torch.FloatTensor`):
+                Log probabilities of the model for the chosen responses. Shape: `(batch_size,)`.
+            rejected_logps (`torch.FloatTensor`):
+                Log probabilities of the model for the rejected responses. Shape: `(batch_size,)`.
+            ref_chosen_logps (`torch.FloatTensor`):
+                Log probabilities of the reference model for the chosen responses. Shape: `(batch_size,)`.
+            ref_rejected_logps (`torch.FloatTensor`):
+                Log probabilities of the reference model for the rejected responses. Shape: `(batch_size,)`.
+            loss_type (`str`, defaults to `"sigmoid"`):
+                The type of loss to compute. One of:
+                - `"sigmoid"`: Sigmoid loss from the original [DPO](https://huggingface.co/papers/2305.18290) paper.
+                - `"hinge"`: Hinge loss on the normalized likelihood from the
+                  [SLiC](https://huggingface.co/papers/2305.10425) paper.
+                - `"ipo"`: IPO loss from the [IPO](https://huggingface.co/papers/2310.12036) paper.
+                - `"exo_pair"`: Pairwise EXO loss from the [EXO](https://huggingface.co/papers/2402.00856) paper.
+                - `"nca_pair"`: Pairwise NCA loss from the [NCA](https://huggingface.co/papers/2402.05369) paper.
+                - `"robust"`: Unbiased estimate of the DPO loss that is robust to preference noise from the [Robust
+                  DPO](https://huggingface.co/papers/2403.00409) paper.
+                - `"bco_pair"`: Pairwise BCO loss from the [BCO](https://huggingface.co/papers/2404.04656) paper.
+                - `"sppo_hard"`: SPPO loss with hard label from the [SPPO](https://huggingface.co/papers/2405.00675)
+                  paper.
+                - `"aot"`: AOT loss for paired datasets from the [AOT](https://huggingface.co/papers/2406.05882) paper.
+                - `"aot_pair"`: AOT loss for unpaired datasets from the [AOT](https://huggingface.co/papers/2406.05882)
+                  paper.
+                - `"discopop"`: DiscoPOP (a.k.a Log-Ratio Modulated Loss, LRML) loss from the
+                  [DiscoPOP](https://huggingface.co/papers/2406.08414) paper.
+                - `"apo_zero"`: APO-zero loss from the [APO](https://huggingface.co/papers/2408.06266) paper.
+                - `"apo_down"`: APO-down loss from the [APO](https://huggingface.co/papers/2408.06266) paper.
+                - `"sft"`: Negative log-likelihood loss (standard supervised fine-tuning loss).
+            model_output (`dict[str, torch.FloatTensor]`, *optional*):
+                The output of the model's forward pass. This is used to compute auxiliary losses if enabled.
+
+        Returns
+        -------
+            A tuple of three tensors: `(losses, chosen_rewards, rejected_rewards)`. The losses tensor contains the DPO
+            loss for each example in the batch. The `chosen_rewards` and `rejected_rewards` tensors contain the rewards
+            for the chosen and rejected responses, respectively.
+        """
+        device = self.accelerator.device
+
+        # Get the log ratios for the chosen and rejected responses
+        chosen_logratios = chosen_logps.to(device) - (
+            not self.reference_free
+        ) * ref_chosen_logps.to(device)
+        rejected_logratios = rejected_logps.to(device) - (
+            not self.reference_free
+        ) * ref_rejected_logps.to(device)
+
+        if self.f_divergence_type == FDivergenceType.ALPHA_DIVERGENCE:
+            # The alpha-divergence formula: (1 - u^-alpha) / alpha
+            # The divergence difference between the chosen and rejected sample is:
+            #     (1 - u[w]^-alpha) / alpha - (1 - u[l]^-alpha) / alpha
+            #        = (u[l]^-alpha - u[w]^-alpha) / alpha
+            # where u[w] and u[l] are the policy/reference probability ratios
+            # for the chosen and rejected samples, respectively.
+            alpha_coef = FDivergenceConstants.ALPHA_DIVERGENCE_COEF_DEFAULT
+            if (
+                self.f_divergence_params
+                and FDivergenceConstants.ALPHA_DIVERGENCE_COEF_KEY
+                in self.f_divergence_params
+            ):
+                alpha_coef = float(
+                    self.f_divergence_params[
+                        FDivergenceConstants.ALPHA_DIVERGENCE_COEF_KEY
+                    ]
+                )
+            logits = (
+                cap_exp(rejected_logratios * -alpha_coef)
+                - cap_exp(chosen_logratios * -alpha_coef)
+            ) / alpha_coef
+        else:
+            logratios = chosen_logps - rejected_logps
+            if self.reference_free:
+                ref_logratios = torch.tensor(
+                    [0], dtype=logratios.dtype, device=logratios.device
+                )
+            else:
+                ref_logratios = ref_chosen_logps - ref_rejected_logps
+
+            logratios = logratios.to(self.accelerator.device)
+            ref_logratios = ref_logratios.to(self.accelerator.device)
+            logits = logratios - ref_logratios
+
+            # ============================
+            # SAFE-DPO MODIFICATION START
+            # ============================
+
+            # Only apply if safety labels exist in the batch
+            if "h_w" in model_output and "h_l" in model_output:
+                h_w = model_output["h_w"].float().to(logits.device)
+                h_l = model_output["h_l"].float().to(logits.device)
+                Delta = self.args.delta
+
+                # Debug only first few steps to avoid spam
+                if self.state.global_step < 5:
+                    print("\n[SafeDPO DEBUG]")
+                    print("Delta:", Delta)
+                    print("h_w[:10]:", h_w[:10])
+                    print("h_l[:10]:", h_l[:10])
+                    print("(h_l - h_w)[:10]:", (h_l - h_w)[:10])
+                    print("logits BEFORE:", logits[:10])
+
+                # SafeDPO modifies the margin BEFORE the sigmoid
+                logits = logits - (h_l - h_w) * Delta
+
+                # ============================
+                # SAFE-DPO MODIFICATION END
+                # ===========================
+
+            if self.f_divergence_type == FDivergenceType.JS_DIVERGENCE:
+                # The js-divergence formula: log(2 * u / (1 + u))
+                # The divergence difference between the chosen and rejected sample is:
+                #     log(2 * u[w] / (1 + u[w])) - log(2 * u[l] / (1 + u[l]))
+                #       = log(u[w]) - log(u[l]) - (log(1 + u[w]) - log(1 + u[l]))
+                # where u[w] and u[l] are the policy/reference probability ratios
+                # for the chosen and rejected samples, respectively.
+                logits -= F.softplus(chosen_logratios) - F.softplus(rejected_logratios)
+
+        # The beta is a temperature parameter for the DPO loss, typically something in the range of 0.1 to 0.5.
+        # We ignore the reference model as beta -> 0. The label_smoothing parameter encodes our uncertainty about the
+        # labels and calculates a conservative DPO loss.
+        if loss_type == "sigmoid":
+            losses = (
+                -F.logsigmoid(self.beta * logits) * (1 - self.label_smoothing)
+                - F.logsigmoid(-self.beta * logits) * self.label_smoothing
+            )
+
+        elif loss_type == "robust":
+            losses = (
+                -F.logsigmoid(self.beta * logits) * (1 - self.label_smoothing)
+                + F.logsigmoid(-self.beta * logits) * self.label_smoothing
+            ) / (1 - 2 * self.label_smoothing)
+
+        elif loss_type == "exo_pair":
+            # eqn (16) of the EXO paper: https://huggingface.co/papers/2402.00856
+            import math
+
+            if self.label_smoothing == 0:
+                self.label_smoothing = 1e-3
+            losses = (self.beta * logits).sigmoid() * (
+                F.logsigmoid(self.beta * logits) - math.log(1 - self.label_smoothing)
+            ) + (-self.beta * logits).sigmoid() * (
+                F.logsigmoid(-self.beta * logits) - math.log(self.label_smoothing)
+            )
+
+        elif loss_type == "hinge":
+            losses = torch.relu(1 - self.beta * logits)
+
+        elif loss_type == "ipo":
+            # eqn (17) of the paper where beta is the regularization parameter for the IPO loss, denoted by tau in the paper.
+            losses = (logits - 1 / (2 * self.beta)) ** 2
+
+        elif loss_type == "bco_pair":
+            chosen_logratios = chosen_logps - ref_chosen_logps
+            rejected_logratios = rejected_logps - ref_rejected_logps
+            chosen_rewards = self.beta * chosen_logratios
+            rejected_rewards = self.beta * rejected_logratios
+            rewards = torch.cat((chosen_rewards, rejected_rewards), 0).mean().detach()
+            self.running.update(rewards)
+            delta = self.running.mean
+            losses = -F.logsigmoid(
+                (self.beta * chosen_logratios) - delta
+            ) - F.logsigmoid(-(self.beta * rejected_logratios - delta))
+
+        elif loss_type == "sppo_hard":
+            # In the paper (https://huggingface.co/papers/2405.00675), SPPO employs a soft probability approach,
+            # estimated using the PairRM score. The probability calculation is conducted outside of the trainer class.
+            # The version described here is the hard probability version, where P in Equation (4.7) of Algorithm 1 is
+            # set to 1 for the winner and 0 for the loser.
+            a = chosen_logps - ref_chosen_logps
+            b = rejected_logps - ref_rejected_logps
+            losses = (a - 0.5 / self.beta) ** 2 + (b + 0.5 / self.beta) ** 2
+
+        elif loss_type == "nca_pair":
+            chosen_rewards = (chosen_logps - ref_chosen_logps) * self.beta
+            rejected_rewards = (rejected_logps - ref_rejected_logps) * self.beta
+            losses = (
+                -F.logsigmoid(chosen_rewards)
+                - 0.5 * F.logsigmoid(-chosen_rewards)
+                - 0.5 * F.logsigmoid(-rejected_rewards)
+            )
+
+        elif loss_type == "aot_pair":
+            chosen_logratios = chosen_logps - ref_chosen_logps
+            rejected_logratios = rejected_logps - ref_rejected_logps
+            chosen_logratios_sorted, _ = torch.sort(chosen_logratios, dim=0)
+            rejected_logratios_sorted, _ = torch.sort(rejected_logratios, dim=0)
+            delta = chosen_logratios_sorted - rejected_logratios_sorted
+            losses = (
+                -F.logsigmoid(self.beta * delta) * (1 - self.label_smoothing)
+                - F.logsigmoid(-self.beta * delta) * self.label_smoothing
+            )
+
+        elif loss_type == "aot":
+            logratios = chosen_logps - rejected_logps
+            ref_logratios = ref_chosen_logps - ref_rejected_logps
+            logratios_sorted, _ = torch.sort(logratios, dim=0)
+            ref_logratios_sorted, _ = torch.sort(ref_logratios, dim=0)
+            delta = logratios_sorted - ref_logratios_sorted
+            losses = (
+                -F.logsigmoid(self.beta * delta) * (1 - self.label_smoothing)
+                - F.logsigmoid(-self.beta * delta) * self.label_smoothing
+            )
+
+        elif loss_type == "apo_zero":
+            # Eqn (7) of the APO paper (https://huggingface.co/papers/2408.06266)
+            # Use this loss when you believe the chosen outputs are better than your model's default output
+            losses_chosen = 1 - F.sigmoid(
+                self.beta * chosen_logratios
+            )  # Increase chosen likelihood
+            losses_rejected = F.sigmoid(
+                self.beta * rejected_logratios
+            )  # Decrease rejected likelihood
+            losses = losses_chosen + losses_rejected
+
+        elif loss_type == "apo_down":
+            # Eqn (8) of the APO paper (https://huggingface.co/papers/2408.06266)
+            # Use this loss when you believe the chosen outputs are worse than your model's default output.
+            # Decrease chosen likelihood and decrease rejected likelihood more
+            losses_chosen = F.sigmoid(self.beta * chosen_logratios)
+            losses_rejected = 1 - F.sigmoid(
+                self.beta * (chosen_logratios - rejected_logratios)
+            )
+            losses = losses_chosen + losses_rejected
+
+        elif loss_type == "discopop":
+            # Eqn (5) of the DiscoPOP paper (https://huggingface.co/papers/2406.08414)
+            # This loss was discovered with LLM discovery
+            logratios = chosen_logps - rejected_logps
+            ref_logratios = ref_chosen_logps - ref_rejected_logps
+            logits = logratios - ref_logratios
+            logits = logits * self.beta
+            # Modulate the mixing coefficient based on the log ratio magnitudes
+            log_ratio_modulation = torch.sigmoid(logits / self.args.discopop_tau)
+            logistic_component = -F.logsigmoid(logits)
+            exp_component = torch.exp(-logits)
+            # Blend between logistic and exponential component based on log ratio modulation
+            losses = (
+                logistic_component * (1 - log_ratio_modulation)
+                + exp_component * log_ratio_modulation
+            )
+
+        elif loss_type == "sft":
+            # SFT loss is the negative log likelihood loss on chosen responses
+            # This acts as the generation loss component in MPO
+            sft_loss = model_output["nll_loss"]
+            # Create losses tensor with same shape as other losses (per-sample)
+            batch_size = chosen_logps.shape[0]
+            losses = sft_loss.expand(batch_size)
+            # For SFT, we don't have preference rewards, so use zeros
+            chosen_rewards = torch.zeros_like(chosen_logps)
+            rejected_rewards = torch.zeros_like(rejected_logps)
+
+        else:
+            raise ValueError(
+                f"Unknown loss type: {self.loss_type}. Should be one of ['sigmoid', 'hinge', 'ipo', 'exo_pair', "
+                "'nca_pair', 'robust', 'bco_pair', 'sppo_hard', 'aot', 'aot_pair', 'discopop', 'apo_zero', "
+                "'apo_down', 'sft']"
+            )
+
+        chosen_rewards = (
+            self.beta * (chosen_logps.to(device) - ref_chosen_logps.to(device)).detach()
+        )
+        rejected_rewards = (
+            self.beta
+            * (rejected_logps.to(device) - ref_rejected_logps.to(device)).detach()
+        )
+
+        return losses, chosen_rewards, rejected_rewards
+
+    def _compute_loss_liger(
+        self, model: nn.Module, batch: dict[str, list | torch.LongTensor]
+    ) -> dict[str, torch.Tensor]:
+        unwrapped_model = self.accelerator.unwrap_model(model)
+        concatenated_batch = self.concatenated_inputs(
+            batch, padding_value=self.pad_token_id
+        )
+
+        model_kwargs = {}
+        if self.aux_loss_enabled:
+            model_kwargs["output_router_logits"] = True
+
+        # Add the pixel values and attention masks for vision models
+        if "pixel_values" in concatenated_batch:
+            model_kwargs["pixel_values"] = concatenated_batch["pixel_values"]
+        if "pixel_attention_mask" in concatenated_batch:
+            model_kwargs["pixel_attention_mask"] = concatenated_batch[
+                "pixel_attention_mask"
+            ]
+        if "image_sizes" in concatenated_batch:
+            model_kwargs["image_sizes"] = concatenated_batch["image_sizes"]
+
+        prompt_attention_mask = concatenated_batch["prompt_attention_mask"]
+        completion_attention_mask = concatenated_batch["completion_attention_mask"]
+
+        if self.is_encoder_decoder:
+            # 1. Get encoder outputs
+            encoder_outputs = unwrapped_model.get_encoder()(
+                concatenated_batch["prompt_input_ids"],
+                attention_mask=concatenated_batch["prompt_attention_mask"],
+                return_dict=True,
+            )
+            # 2. Prepare decoder inputs
+            decoder_input_ids = shift_tokens_right(
+                concatenated_batch["completion_input_ids"],
+                unwrapped_model.config.decoder_start_token_id,
+            )
+            # 3. Get decoder outputs
+            decoder_outputs = unwrapped_model.get_decoder()(
+                input_ids=decoder_input_ids,
+                attention_mask=concatenated_batch["completion_attention_mask"],
+                encoder_hidden_states=encoder_outputs.last_hidden_state,
+                encoder_attention_mask=concatenated_batch["prompt_attention_mask"],
+                use_cache=False,
+            )
+            hidden_states = decoder_outputs.last_hidden_state
+
+            ref_hidden_states = None
+            if not self.reference_free and self.ref_model is not None:
+                unwrapped_ref_model = self.accelerator.unwrap_model(self.ref_model)
+                ref_encoder_outputs = unwrapped_ref_model.get_encoder()(
+                    concatenated_batch["prompt_input_ids"],
+                    attention_mask=concatenated_batch["prompt_attention_mask"],
+                    return_dict=True,
+                )
+                ref_decoder_outputs = unwrapped_ref_model.get_decoder()(
+                    input_ids=decoder_input_ids,
+                    attention_mask=concatenated_batch["completion_attention_mask"],
+                    encoder_hidden_states=ref_encoder_outputs.last_hidden_state,
+                    encoder_attention_mask=concatenated_batch["prompt_attention_mask"],
+                    use_cache=False,
+                )
+                ref_hidden_states = ref_decoder_outputs.last_hidden_state
+            elif not self.reference_free:
+                with self.null_ref_context():
+                    ref_encoder_outputs = unwrapped_model.get_encoder()(
+                        concatenated_batch["prompt_input_ids"],
+                        attention_mask=concatenated_batch["prompt_attention_mask"],
+                        return_dict=True,
+                    )
+                    ref_decoder_outputs = unwrapped_model.get_decoder()(
+                        input_ids=decoder_input_ids,
+                        attention_mask=concatenated_batch["completion_attention_mask"],
+                        encoder_hidden_states=ref_encoder_outputs.last_hidden_state,
+                        encoder_attention_mask=concatenated_batch[
+                            "prompt_attention_mask"
+                        ],
+                        use_cache=False,
+                    )
+                    ref_hidden_states = ref_decoder_outputs.last_hidden_state
+
+            labels = concatenated_batch["completion_input_ids"]
+            loss_mask = completion_attention_mask.bool()
+        else:
+            # For decoder-only models
+            input_ids = torch.cat(
+                (
+                    concatenated_batch["prompt_input_ids"],
+                    concatenated_batch["completion_input_ids"],
+                ),
+                dim=1,
+            )
+            attention_mask = torch.cat(
+                (
+                    concatenated_batch["prompt_attention_mask"],
+                    concatenated_batch["completion_attention_mask"],
+                ),
+                dim=1,
+            )
+            # Mask the prompt but not the completion for the loss
+            loss_mask = torch.cat(
+                (torch.zeros_like(prompt_attention_mask), completion_attention_mask),
+                dim=1,
+            )
+
+            # Flush and truncate
+            if self.max_length is not None and self.max_length < attention_mask.size(1):
+                if self.truncation_mode == "keep_start":
+                    # Flush left to reduce the memory usage
+                    # [[0, 0, x, x, x, x],  ->  [[x, x, x, x],
+                    #  [0, x, x, x, 0, 0]]       [x, x, x, 0]]
+                    attention_mask, input_ids, loss_mask = flush_left(
+                        attention_mask, input_ids, loss_mask
+                    )
+                    attention_mask = attention_mask[:, : self.max_length]
+                    input_ids = input_ids[:, : self.max_length]
+                    loss_mask = loss_mask[:, : self.max_length]
+                elif self.truncation_mode == "keep_end":
+                    # Flush right before truncating left, then flush left
+                    # [[0, 0, x, x, x, x],  ->  [[0, 0, x, x],
+                    #  [0, x, x, x, 0, 0]]       [0, x, x, x]]
+                    attention_mask, input_ids, loss_mask = flush_right(
+                        attention_mask, input_ids, loss_mask
+                    )
+                    input_ids = input_ids[:, -self.max_length :]
+                    attention_mask = attention_mask[:, -self.max_length :]
+                    loss_mask = loss_mask[:, -self.max_length :]
+                    attention_mask, input_ids, loss_mask = flush_left(
+                        attention_mask, input_ids, loss_mask
+                    )
+                else:
+                    raise ValueError(
+                        f"Unknown truncation mode: '{self.truncation_mode}'. Should be one of ['keep_end', "
+                        "'keep_start']."
+                    )
+            else:
+                # Flush left to reduce the memory usage
+                # [[0, 0, x, x, x, x],  ->  [[x, x, x, x],
+                #  [0, x, x, x, 0, 0]]       [x, x, x, 0]]
+                attention_mask, input_ids, loss_mask = flush_left(
+                    attention_mask, input_ids, loss_mask
+                )
+
+            # Add logits_to_keep optimization
+            if self.use_logits_to_keep:
+                first_compute_index = loss_mask.nonzero(as_tuple=True)[1].min()
+                logits_to_keep = (loss_mask.shape[1] - first_compute_index).item() + 1
+                model_kwargs["logits_to_keep"] = logits_to_keep
+
+            model_kwargs["output_hidden_states"] = True
+
+            # Add padding-free training support
+            if self.padding_free:
+                input_ids = input_ids[attention_mask.bool()].unsqueeze(0)
+                loss_mask = loss_mask[attention_mask.bool()].unsqueeze(0)
+                position_ids = (
+                    attention_mask.cumsum(1)[attention_mask.bool()].unsqueeze(0) - 1
+                )
+                model_kwargs["position_ids"] = position_ids
+            else:
+                model_kwargs["attention_mask"] = attention_mask
+
+            # Get the base model outputs (before LM head)
+            if (
+                hasattr(unwrapped_model, "get_decoder")
+                and unwrapped_model.get_decoder() is not None
+            ):
+                base_model = unwrapped_model.get_decoder()
+            else:
+                base_attr = getattr(
+                    unwrapped_model,
+                    "base_model_prefix",
+                    self.args.base_model_attribute_name,
+                )
+                base_model = getattr(unwrapped_model, base_attr, unwrapped_model)
+
+            outputs = base_model(
+                input_ids,
+                use_cache=False,
+                **model_kwargs,
+            )
+            hidden_states = outputs.last_hidden_state[:, :-1]
+
+            # Get reference hidden states if needed
+            ref_hidden_states = None
+            if not self.reference_free and self.ref_model is not None:
+                unwrapped_ref_model = self.accelerator.unwrap_model(self.ref_model)
+                if (
+                    hasattr(unwrapped_ref_model, "get_decoder")
+                    and unwrapped_ref_model.get_decoder() is not None
+                ):
+                    ref_base_model = unwrapped_ref_model.get_decoder()
+                else:
+                    ref_attr = getattr(
+                        unwrapped_ref_model,
+                        "base_model_prefix",
+                        self.args.base_model_attribute_name,
+                    )
+                    ref_base_model = getattr(
+                        unwrapped_ref_model, ref_attr, unwrapped_ref_model
+                    )
+
+                ref_outputs = ref_base_model(
+                    input_ids,
+                    use_cache=False,
+                    **model_kwargs,
+                )
+                ref_hidden_states = ref_outputs.last_hidden_state[:, :-1]
+            elif not self.reference_free:
+                if (
+                    hasattr(unwrapped_model, "get_decoder")
+                    and unwrapped_model.get_decoder() is not None
+                ):
+                    ref_base_model = unwrapped_model.get_decoder()
+                else:
+                    ref_attr = getattr(
+                        unwrapped_model,
+                        "base_model_prefix",
+                        self.args.base_model_attribute_name,
+                    )
+                    ref_base_model = getattr(unwrapped_model, ref_attr, unwrapped_model)
+                with self.null_ref_context():
+                    ref_outputs = ref_base_model(
+                        input_ids,
+                        use_cache=False,
+                        **model_kwargs,
+                    )
+                    ref_hidden_states = ref_outputs.last_hidden_state[:, :-1]
+
+            masked_input_ids = torch.where(
+                loss_mask != 0, input_ids, self.label_pad_token_id
+            )
+            labels = masked_input_ids[:, 1:]  # Shift right for casual LM
+
+        # Get the LM head
+        lm_head = unwrapped_model.get_output_embeddings()
+
+        # Get reference model weights if needed
+        ref_weight = None
+        ref_bias = None
+        if not self.reference_free:
+            if self.ref_model is not None:
+                unwrapped_ref_model = self.accelerator.unwrap_model(self.ref_model)
+                ref_lm_head = unwrapped_ref_model.get_output_embeddings()
+            else:
+                with self.null_ref_context():
+                    ref_lm_head = unwrapped_model.get_output_embeddings()
+            ref_weight = ref_lm_head.weight
+            ref_bias = ref_lm_head.bias if hasattr(ref_lm_head, "bias") else None
+
+        # Compute loss using Liger kernel
+        loss_output = self.dpo_loss_fn(
+            lm_head.weight,
+            hidden_states,
+            labels,
+            bias=lm_head.bias if hasattr(lm_head, "bias") else None,
+            ref_input=ref_hidden_states if not self.reference_free else None,
+            ref_weight=ref_weight if not self.reference_free else None,
+            ref_bias=ref_bias if not self.reference_free else None,
+        )
+        (
+            loss,
+            (
+                chosen_logps,
+                rejected_logps,
+                chosen_logits_mean,
+                rejected_logits_mean,
+                nll_loss,
+                *aux_outputs,
+            ),
+        ) = loss_output
+
+        output = {
+            "loss": loss,
+            "chosen_logps": chosen_logps,
+            "rejected_logps": rejected_logps,
+            "mean_chosen_logits": chosen_logits_mean,
+            "mean_rejected_logits": rejected_logits_mean,
+            "nll_loss": nll_loss,
+            "chosen_rewards": aux_outputs[0],
+            "rejected_rewards": aux_outputs[1],
+        }
+        if self.aux_loss_enabled:
+            output["aux_loss"] = outputs.aux_loss
+
+        return output
+
+    def concatenated_forward(
+        self,
+        model: nn.Module,
+        batch: dict[str, list | torch.LongTensor],
+        is_ref_model: bool = False,
+    ) -> dict[str, torch.Tensor]:
+        """
+        Runs the given model on the given batch of inputs, concatenating the chosen and rejected inputs together.
+
+        We do this to avoid doing two forward passes, because it's faster for FSDP.
+
+        Args:
+            model:
+                Model to run the forward pass on.
+            batch:
+                Batch of input data.
+            is_ref_model:
+                Whether this method is being called for the reference model. If `True`, length desensitization is not
+                applied.
+        """
+        num_examples = batch["prompt_input_ids"].shape[0]
+
+        concatenated_batch = self.concatenated_inputs(
+            batch, padding_value=self.pad_token_id
+        )
+
+        model_kwargs = {"use_cache": False}
+        if self.aux_loss_enabled:
+            model_kwargs["output_router_logits"] = True
+
+        # Add the pixel values and attention masks for vision models
+        if "pixel_values" in concatenated_batch:
+            model_kwargs["pixel_values"] = concatenated_batch["pixel_values"]
+        if "pixel_attention_mask" in concatenated_batch:
+            model_kwargs["pixel_attention_mask"] = concatenated_batch[
+                "pixel_attention_mask"
+            ]
+        if "image_sizes" in concatenated_batch:
+            model_kwargs["image_sizes"] = concatenated_batch["image_sizes"]
+
+        prompt_input_ids = concatenated_batch["prompt_input_ids"]
+        prompt_attention_mask = concatenated_batch["prompt_attention_mask"]
+        completion_input_ids = concatenated_batch["completion_input_ids"]
+        completion_attention_mask = concatenated_batch["completion_attention_mask"]
+        if self.is_encoder_decoder:
+            labels = completion_input_ids
+            labels[completion_attention_mask == 0] = self.label_pad_token_id
+            outputs = model(
+                input_ids=prompt_input_ids,
+                attention_mask=prompt_attention_mask,
+                labels=labels,  # we need the labels for the logits to be returned
+                **model_kwargs,
+            )
+            logits = outputs.logits
+            loss_mask = completion_attention_mask.bool()
+        else:
+            # Concatenate the prompt and completion inputs
+            input_ids = torch.cat((prompt_input_ids, completion_input_ids), dim=1)
+            attention_mask = torch.cat(
+                (prompt_attention_mask, completion_attention_mask), dim=1
+            )
+            if "token_type_ids" in concatenated_batch:
+                prompt_token_type_ids = concatenated_batch["token_type_ids"]
+                token_type_ids = pad_to_length(
+                    prompt_token_type_ids, input_ids.shape[1], 0
+                )
+            # Mask the prompt but not the completion for the loss
+            loss_mask = torch.cat(
+                (torch.zeros_like(prompt_attention_mask), completion_attention_mask),
+                dim=1,
+            )
+
+            # Flush and truncate
+            if self.max_length is not None and self.max_length < attention_mask.size(1):
+                if self.truncation_mode == "keep_start":
+                    # Flush left to reduce the memory usage
+                    # [[0, 0, x, x, x, x],  ->  [[x, x, x, x],
+                    #  [0, x, x, x, 0, 0]]       [x, x, x, 0]]
+                    if "token_type_ids" in concatenated_batch:
+                        attention_mask, input_ids, loss_mask, token_type_ids = (
+                            flush_left(
+                                attention_mask, input_ids, loss_mask, token_type_ids
+                            )
+                        )
+                    else:
+                        attention_mask, input_ids, loss_mask = flush_left(
+                            attention_mask, input_ids, loss_mask
+                        )
+                    attention_mask = attention_mask[:, : self.max_length]
+                    input_ids = input_ids[:, : self.max_length]
+                    loss_mask = loss_mask[:, : self.max_length]
+                elif self.truncation_mode == "keep_end":
+                    # Flush right before truncating left, then flush left
+                    # [[0, 0, x, x, x, x],  ->  [[0, 0, x, x],
+                    #  [0, x, x, x, 0, 0]]       [0, x, x, x]]
+                    if "token_type_ids" in concatenated_batch:
+                        attention_mask, input_ids, loss_mask, token_type_ids = (
+                            flush_left(
+                                attention_mask, input_ids, loss_mask, token_type_ids
+                            )
+                        )
+                        token_type_ids = token_type_ids[:, -self.max_length :]
+                    else:
+                        attention_mask, input_ids, loss_mask = flush_right(
+                            attention_mask, input_ids, loss_mask
+                        )
+                    input_ids = input_ids[:, -self.max_length :]
+                    attention_mask = attention_mask[:, -self.max_length :]
+                    loss_mask = loss_mask[:, -self.max_length :]
+                    if "token_type_ids" in concatenated_batch:
+                        attention_mask, input_ids, loss_mask, token_type_ids = (
+                            flush_left(
+                                attention_mask, input_ids, loss_mask, token_type_ids
+                            )
+                        )
+                    else:
+                        attention_mask, input_ids, loss_mask = flush_left(
+                            attention_mask, input_ids, loss_mask
+                        )
+                else:
+                    raise ValueError(
+                        f"Unknown truncation mode: '{self.truncation_mode}'. Should be one of ['keep_end', "
+                        "'keep_start']."
+                    )
+            # Flush left to reduce the memory usage
+            # [[0, 0, x, x, x, x],  ->  [[x, x, x, x],
+            #  [0, x, x, x, 0, 0]]       [x, x, x, 0]]
+            elif "token_type_ids" in concatenated_batch:
+                attention_mask, input_ids, loss_mask, token_type_ids = flush_left(
+                    attention_mask, input_ids, loss_mask, token_type_ids
+                )
+            else:
+                attention_mask, input_ids, loss_mask = flush_left(
+                    attention_mask, input_ids, loss_mask
+                )
+
+            if "token_type_ids" in concatenated_batch:
+                model_kwargs["token_type_ids"] = token_type_ids
+
+            if self.use_logits_to_keep:
+                # Compute logits_to_keep based on loss_mask pattern:
+                # [[0, 0, 0, x, x, x, x],
+                #  [0, 0, 0, x, x, x, 0]]
+                #         ^ start computing logits from here ([:, -(7-3+1):])
+                first_compute_index = loss_mask.nonzero(as_tuple=True)[1].min()
+                logits_to_keep = (
+                    loss_mask.shape[1] - first_compute_index
+                ).item() + 1  # +1 for the first label
+                model_kwargs["logits_to_keep"] = logits_to_keep
+
+            model_kwargs["output_hidden_states"] = True
+
+            if self.padding_free:
+                # Flatten the input_ids, position_ids, and loss_mask
+                # input_ids = [[a, b, c, 0], ->     input_ids = [[a, b, c, d, e, f, g]]
+                #              [d, e, f, g]]     position_ids = [[0, 1, 2, 0, 1, 2, 3]]
+                input_ids = input_ids[attention_mask.bool()].unsqueeze(0)
+                loss_mask = loss_mask[attention_mask.bool()].unsqueeze(0)
+                position_ids = (
+                    attention_mask.cumsum(1)[attention_mask.bool()].unsqueeze(0) - 1
+                )
+                model_kwargs["position_ids"] = position_ids
+            else:
+                model_kwargs["attention_mask"] = attention_mask
+
+            outputs = model(input_ids, **model_kwargs)
+            logits = outputs.logits
+
+            # Offset the logits by one to align with the labels
+            labels = torch.roll(input_ids, shifts=-1, dims=1)
+            loss_mask = torch.roll(loss_mask, shifts=-1, dims=1).bool()
+
+            if self.use_logits_to_keep:
+                # Align labels with logits
+                # logits:    -,  -, [x2, x3, x4, x5, x6]
+                #                     ^ --------- ^       after logits[:, :-1, :]
+                # labels:   [y0, y1, y2, y3, y4, y5, y6]
+                #                         ^ --------- ^   with logits_to_keep=4, [:, -4:]
+                # loss_mask: [0,  0,  0,  1,  1,  1,  1]
+                labels = labels[:, -logits_to_keep:]
+                loss_mask = loss_mask[:, -logits_to_keep:]
+
+        if logits.shape[:2] != labels.shape[:2]:
+            # for LLaVA, the returned logits include the image tokens (placed before the text tokens)
+            seq_len = labels.shape[1]
+            logits = logits[:, -seq_len:]
+
+        # Compute the log probabilities of the labels
+        labels[~loss_mask] = (
+            0  # dummy token; we'll ignore the losses on these tokens later
+        )
+        per_token_logps = selective_log_softmax(logits, labels)
+        per_token_logps[~loss_mask] = 0
+        per_token_logps = torch.roll(per_token_logps, shifts=1, dims=1)
+
+        if self.padding_free:
+            # Unflatten the per_token_logps (shape: [1, sum_seq_len] -> [batch_size, seq_len])
+            batch_size, seq_len = attention_mask.shape
+            per_token_logps_ = torch.zeros(
+                batch_size,
+                seq_len,
+                device=outputs.logits.device,
+                dtype=outputs.logits.dtype,
+            )
+            per_token_logps_[attention_mask.bool()] = per_token_logps
+            per_token_logps = per_token_logps_
+
+        all_logps = per_token_logps[:, 1:].sum(-1)
+
+        output = {}
+
+        if self.use_weighting:
+            with torch.no_grad():
+                # Eq (2) of the WPO paper: https://huggingface.co/papers/2406.11827
+                logprobs = F.log_softmax(logits, dim=-1)
+                weights_adjustment_factor = torch.logsumexp(
+                    2 * logprobs, dim=-1
+                )  # same as sum(probs**2) in log space
+                per_token_logps_adjusted = per_token_logps - weights_adjustment_factor
+                all_weights = (per_token_logps_adjusted * loss_mask).sum(
+                    -1
+                ) / loss_mask.sum(-1)
+                chosen_weights = all_weights[:num_examples]
+                rejected_weights = all_weights[num_examples:]
+                output["policy_weights"] = torch.clamp(
+                    torch.exp(chosen_weights + rejected_weights), max=1
+                )
+
+        if self.args.rpo_alpha is not None or "sft" in self.loss_type:
+            # Only use the chosen logits for the RPO loss or SFT loss
+            chosen_logits = (
+                logits[:num_examples, :-1]
+                if not self.is_encoder_decoder
+                else logits[:num_examples]
+            )
+            chosen_labels = (
+                labels[:num_examples, :-1]
+                if not self.is_encoder_decoder
+                else labels[:num_examples]
+            )
+
+            # Compute the log probabilities of the labels
+            output["nll_loss"] = F.cross_entropy(
+                torch.flatten(chosen_logits, end_dim=1),
+                torch.flatten(chosen_labels, end_dim=1),
+                ignore_index=0,
+            )
+
+        if "ipo" in self.loss_type:
+            all_logps = all_logps / loss_mask.sum(-1)
+
+        if self.args.ld_alpha is not None and not is_ref_model:
+            # Compute response lengths based on loss_mask
+            completion_lengths = loss_mask.sum(dim=1)
+
+            chosen_lengths = completion_lengths[:num_examples]
+            rejected_lengths = completion_lengths[num_examples:]
+            public_lengths = torch.min(
+                chosen_lengths, rejected_lengths
+            )  # l_p in the paper
+            public_lengths = torch.cat([public_lengths, public_lengths], dim=0)
+
+            seq_len = per_token_logps.size(1)
+            position_ids = torch.arange(
+                seq_len, device=per_token_logps.device
+            ).expand_as(per_token_logps)
+
+            ld_mask = position_ids < public_lengths.unsqueeze(1)
+            mask = position_ids < completion_lengths.unsqueeze(1)
+
+            front_mask = (ld_mask & mask).float()
+            rear_mask = (~ld_mask & mask).float()
+            front_logps = (per_token_logps * front_mask).sum(dim=1)
+            rear_logps = (per_token_logps * rear_mask).sum(dim=1)
+
+            all_logps = front_logps + self.args.ld_alpha * rear_logps
+
+        output["chosen_logps"] = all_logps[:num_examples]
+        output["rejected_logps"] = all_logps[num_examples:]
+
+        # Compute the mean logits
+        if self.padding_free:
+            # position_ids contains a sequence of range identifiers (e.g., [[0, 1, 2, 0, 1, 2, 3, ...]]).
+            # There are 2*num_examples ranges in total: the first half corresponds to the chosen tokens,
+            # and the second half to the rejected tokens.
+            # To find the start of the rejected tokens, we look for the num_examples+1-th zero in pos_id.
+            split_idx = (position_ids == 0).nonzero(as_tuple=True)[1][num_examples]
+            mean_chosen_logits = logits[0, :split_idx][loss_mask[0, :split_idx]].mean()
+            mean_rejected_logits = logits[0, split_idx:][
+                loss_mask[0, split_idx:]
+            ].mean()
+        else:
+            mean_chosen_logits = logits[:num_examples][loss_mask[:num_examples]].mean()
+            mean_rejected_logits = logits[num_examples:][
+                loss_mask[num_examples:]
+            ].mean()
+
+        output["mean_chosen_logits"] = mean_chosen_logits
+        output["mean_rejected_logits"] = mean_rejected_logits
+
+        if self.aux_loss_enabled:
+            output["aux_loss"] = outputs.aux_loss
+
+        return output
+
+    def get_batch_loss_metrics(
+        self,
+        model: PreTrainedModel | nn.Module,
+        batch: dict[str, list | torch.LongTensor],
+        train_eval: Literal["train", "eval"] = "train",
+    ) -> tuple[torch.Tensor, dict[str, float]]:
+        """Compute the DPO loss and other metrics for the given batch of inputs for train or test."""
+        metrics = {}
+
+        if self.args.use_liger_kernel:
+            model_output = self._compute_loss_liger(model, batch)
+            losses = model_output["loss"]
+            chosen_rewards = model_output["chosen_rewards"]
+            rejected_rewards = model_output["rejected_rewards"]
+        else:
+            model_output = self.concatenated_forward(model, batch)
+
+            if "h_w" in batch:
+                model_output["h_w"] = batch["h_w"]
+            if "h_l" in batch:
+                model_output["h_l"] = batch["h_l"]
+
+            # if ref_chosen_logps and ref_rejected_logps in batch use them, otherwise use the reference model
+            if "ref_chosen_logps" in batch and "ref_rejected_logps" in batch:
+                ref_chosen_logps = batch["ref_chosen_logps"]
+                ref_rejected_logps = batch["ref_rejected_logps"]
+            else:
+                ref_chosen_logps, ref_rejected_logps = self.compute_ref_log_probs(batch)
+
+            # Initialize combined losses
+            losses = 0
+            chosen_rewards = 0
+            rejected_rewards = 0
+
+            # Compute losses for each loss type
+            for idx, loss_type in enumerate(self.loss_type):
+                # Compute individual loss using standard DPO loss function
+                _losses, _chosen_rewards, _rejected_rewards = self.dpo_loss(
+                    model_output["chosen_logps"],
+                    model_output["rejected_logps"],
+                    ref_chosen_logps,
+                    ref_rejected_logps,
+                    loss_type,
+                    model_output,
+                )
+
+                # Add weighted contributions
+                weight = self.loss_weights[idx] if self.loss_weights else 1.0
+                losses = losses + _losses * weight
+                chosen_rewards = chosen_rewards + _chosen_rewards * weight
+                rejected_rewards = rejected_rewards + _rejected_rewards * weight
+
+        reward_accuracies = (chosen_rewards > rejected_rewards).float()
+
+        if self.args.rpo_alpha is not None:
+            losses = (
+                losses + self.args.rpo_alpha * model_output["nll_loss"]
+            )  # RPO loss from V3 of the paper
+
+        if self.use_weighting:
+            losses = losses * model_output["policy_weights"]
+
+        if self.aux_loss_enabled:
+            losses = losses + self.aux_loss_coef * model_output["aux_loss"]
+
+        prefix = "eval_" if train_eval == "eval" else ""
+        metrics[f"{prefix}rewards/chosen"] = (
+            self.accelerator.gather_for_metrics(chosen_rewards).mean().item()
+        )
+        metrics[f"{prefix}rewards/rejected"] = (
+            self.accelerator.gather_for_metrics(rejected_rewards).mean().item()
+        )
+        metrics[f"{prefix}rewards/accuracies"] = (
+            self.accelerator.gather_for_metrics(reward_accuracies).mean().item()
+        )
+        metrics[f"{prefix}rewards/margins"] = (
+            self.accelerator.gather_for_metrics(chosen_rewards - rejected_rewards)
+            .mean()
+            .item()
+        )
+        metrics[f"{prefix}logps/chosen"] = (
+            self.accelerator.gather_for_metrics(model_output["chosen_logps"])
+            .detach()
+            .mean()
+            .item()
+        )
+        metrics[f"{prefix}logps/rejected"] = (
+            self.accelerator.gather_for_metrics(model_output["rejected_logps"])
+            .detach()
+            .mean()
+            .item()
+        )
+        metrics[f"{prefix}logits/chosen"] = (
+            self.accelerator.gather_for_metrics(model_output["mean_chosen_logits"])
+            .detach()
+            .mean()
+            .item()
+        )
+        metrics[f"{prefix}logits/rejected"] = (
+            self.accelerator.gather_for_metrics(model_output["mean_rejected_logits"])
+            .detach()
+            .mean()
+            .item()
+        )
+        if self.args.rpo_alpha is not None or "sft" in self.loss_type:
+            metrics[f"{prefix}nll_loss"] = (
+                self.accelerator.gather_for_metrics(model_output["nll_loss"])
+                .detach()
+                .mean()
+                .item()
+            )
+        if self.aux_loss_enabled:
+            metrics[f"{prefix}aux_loss"] = (
+                self.accelerator.gather_for_metrics(model_output["aux_loss"])
+                .detach()
+                .mean()
+                .item()
+            )
+
+        return losses.mean(), metrics
+
+    def compute_loss(
+        self,
+        model: PreTrainedModel | nn.Module,
+        inputs: dict[str, torch.Tensor | Any],
+        return_outputs=False,
+        num_items_in_batch=None,
+    ) -> torch.Tensor | tuple[torch.Tensor, dict[str, float]]:
+        compute_loss_context_manager = (
+            autocast(self.accelerator.device.type)
+            if self._peft_has_been_casted_to_bf16
+            else nullcontext()
+        )
+        with compute_loss_context_manager:
+            loss, metrics = self.get_batch_loss_metrics(
+                model, inputs, train_eval="train"
+            )
+
+        # Make sure to move the loss to the device the original accumulating loss is at back in the `Trainer` class:
+        loss = loss.to(self.args.device)
+        # force log the metrics
+        self.store_metrics(metrics, train_eval="train")
+
+        if return_outputs:
+            return loss, metrics
+
+        return loss
+
+    def generate_from_model_and_ref(
+        self, model, batch: dict[str, torch.LongTensor]
+    ) -> tuple[str, str]:
+        """Generate samples from the model and reference model for the given batch of inputs."""
+        # If one uses `generate_during_eval` with peft + bf16, we need to explicitly call generate with
+        # the torch amp context manager as some hidden states are silently casted to full precision.
+        generate_context_manager = (
+            autocast(self.accelerator.device.type)
+            if self._peft_has_been_casted_to_bf16
+            else nullcontext()
+        )
+
+        with generate_context_manager:
+            policy_output = model.generate(
+                input_ids=batch["prompt_input_ids"],
+                attention_mask=batch["prompt_attention_mask"],
+                max_length=self.max_length,
+                do_sample=True,
+                pad_token_id=self.pad_token_id,
+            )
+
+            # if ref_output in batch use that otherwise use the reference model
+            if "ref_output" in batch:
+                ref_output = batch["ref_output"]
+            elif self.ref_model is None:
+                with self.null_ref_context():
+                    ref_output = self.model.generate(
+                        input_ids=batch["prompt_input_ids"],
+                        attention_mask=batch["prompt_attention_mask"],
+                        max_length=self.max_length,
+                        do_sample=True,
+                        pad_token_id=self.pad_token_id,
+                    )
+            else:
+                ref_output = self.ref_model.generate(
+                    input_ids=batch["prompt_input_ids"],
+                    attention_mask=batch["prompt_attention_mask"],
+                    max_length=self.max_length,
+                    do_sample=True,
+                    pad_token_id=self.pad_token_id,
+                )
+
+        policy_output = pad_to_length(policy_output, self.max_length, self.pad_token_id)
+        policy_output_decoded = self.processing_class.batch_decode(
+            policy_output, skip_special_tokens=True
+        )
+
+        ref_output = pad_to_length(ref_output, self.max_length, self.pad_token_id)
+        ref_output_decoded = self.processing_class.batch_decode(
+            ref_output, skip_special_tokens=True
+        )
+
+        return policy_output_decoded, ref_output_decoded
+
+    def prediction_step(
+        self,
+        model: PreTrainedModel | nn.Module,
+        inputs: dict[str, torch.Tensor | Any],
+        prediction_loss_only: bool,
+        ignore_keys: list[str] | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None, torch.Tensor | None]:
+        if ignore_keys is None:
+            if hasattr(model, "config"):
+                ignore_keys = getattr(model.config, "keys_to_ignore_at_inference", [])
+            else:
+                ignore_keys = []
+
+        prediction_context_manager = (
+            autocast(self.accelerator.device.type)
+            if self._peft_has_been_casted_to_bf16
+            else nullcontext()
+        )
+
+        with torch.no_grad(), prediction_context_manager:
+            loss, metrics = self.get_batch_loss_metrics(
+                model, inputs, train_eval="eval"
+            )
+
+        # force log the metrics
+        self.store_metrics(metrics, train_eval="eval")
+
+        if prediction_loss_only:
+            return loss.detach(), None, None
+
+        # logits for the chosen and rejected samples from model
+        logits_dict = {
+            "eval_logits/chosen": metrics["eval_logits/chosen"],
+            "eval_logits/rejected": metrics["eval_logits/rejected"],
+        }
+        logits = [v for k, v in logits_dict.items() if k not in ignore_keys]
+        logits = torch.tensor(logits, device=self.accelerator.device)
+        labels = torch.zeros(logits.shape[0], device=self.accelerator.device)
+
+        return (loss.detach(), logits, labels)
+
+    def store_metrics(
+        self, metrics: dict[str, float], train_eval: Literal["train", "eval"] = "train"
+    ) -> None:
+        for key, value in metrics.items():
+            self._stored_metrics[train_eval][key].append(value)
+
+    def evaluation_loop(
+        self,
+        dataloader: DataLoader,
+        description: str,
+        prediction_loss_only: bool | None = None,
+        ignore_keys: list[str] | None = None,
+        metric_key_prefix: str = "eval",
+    ) -> EvalLoopOutput:
+        """
+        Overriding built-in evaluation loop to store metrics for each batch. Prediction/evaluation loop, shared by
+        `Trainer.evaluate()` and `Trainer.predict()`.
+
+        Works both with or without labels.
+        """
+        # Sample and save to game log if requested (for one batch to save time)
+        if self.generate_during_eval:
+            # Generate random indices within the range of the total number of samples
+            num_samples = len(dataloader.dataset)
+            random_indices = random.sample(
+                range(num_samples), k=self.args.eval_batch_size
+            )
+
+            # Use dataloader.dataset.select to get the random batch without iterating over the DataLoader
+            random_batch_dataset = dataloader.dataset.select(random_indices)
+            random_batch = self.data_collator(random_batch_dataset)
+            random_batch = self._prepare_inputs(random_batch)
+
+            policy_output_decoded, ref_output_decoded = (
+                self.generate_from_model_and_ref(self.model, random_batch)
+            )
+
+            table = pd.DataFrame(
+                columns=["Prompt", "Policy", "Ref Model"],
+                data=[
+                    [prompt, pol[len(prompt) :], ref[len(prompt) :]]
+                    for prompt, pol, ref in zip(
+                        random_batch_dataset["prompt"],
+                        policy_output_decoded,
+                        ref_output_decoded,
+                        strict=True,
+                    )
+                ],
+            )
+            if "wandb" in self.args.report_to and self.accelerator.is_main_process:
+                wandb.log({"game_log": wandb.Table(data=table)})
+
+            if "comet_ml" in self.args.report_to:
+                log_table_to_comet_experiment(
+                    name="game_log.csv",
+                    table=table,
+                )
+
+            if "mlflow" in self.args.report_to and self.accelerator.is_main_process:
+                mlflow.log_table(data=table, artifact_file="game_log.json")
+
+        # Base evaluation
+        initial_output = super().evaluation_loop(
+            dataloader,
+            description,
+            prediction_loss_only,
+            ignore_keys,
+            metric_key_prefix,
+        )
+
+        return initial_output
+
+    def log(self, logs: dict[str, float], start_time: float | None = None) -> None:
+        """
+        Log `logs` on the various objects watching training, including stored metrics.
+
+        Args:
+            logs (`dict[str, float]`):
+                The values to log.
+            start_time (`float`, *optional*):
+                Start time of the training.
+        """
+        # logs either has 'loss' or 'eval_loss'
+        train_eval = "train" if "loss" in logs else "eval"
+        # Add averaged stored metrics to logs
+        for key, metrics in self._stored_metrics[train_eval].items():
+            logs[key] = torch.tensor(metrics).mean().item()
+        del self._stored_metrics[train_eval]
+        return super().log(logs, start_time)
+
+    # Ensure the model card is saved along with the checkpoint
+    def _save_checkpoint(self, model, trial):
+        if self.args.hub_model_id is None:
+            model_name = Path(self.args.output_dir).name
+        else:
+            model_name = self.args.hub_model_id.split("/")[-1]
+        self.create_model_card(model_name=model_name)
+        super()._save_checkpoint(model, trial)
diff --git a/src/aixpert/training/training/run_dpo_training.py b/src/aixpert/training/training/run_dpo_training.py
new file mode 100644
index 0000000..6b10fd8
--- /dev/null
+++ b/src/aixpert/training/training/run_dpo_training.py
@@ -0,0 +1,59 @@
+"""Run baseline Original-DPO training."""
+
+from utils.config_loader import load_config
+from utils.trainer_utils import (
+    apply_lora,
+    build_dpo_trainer,
+    load_dataset_for_dpo,
+    load_model_and_tokenizer,
+)
+
+
+def train_single_model(model_name: str) -> None:
+    """Load config, dataset, model, and run Original-DPO training for one model."""
+    cfg = load_config()
+    mod = cfg["original_dpo"]
+    hp = mod["hyperparams"]
+    paths = mod["paths"]
+
+    print(f"Training model: {model_name}")
+
+    output_dir = f"{paths['output_root']}/{model_name.replace('/', '_')}_OriginalDPO"
+
+    train_data = load_dataset_for_dpo(paths["train"])
+    eval_data = load_dataset_for_dpo(paths["eval"])
+
+    model, tokenizer = load_model_and_tokenizer(
+        model_name,
+        hp["max_seq_length"],
+        hp["load_in_4bit"],
+    )
+
+    model = apply_lora(model, hp)
+
+    trainer = build_dpo_trainer(
+        model=model,
+        tokenizer=tokenizer,
+        train_data=train_data,
+        eval_data=eval_data,
+        cfg=hp,
+        output_dir=output_dir,
+    )
+
+    print("Starting training...")
+    trainer.train()
+
+    trainer.save_model(output_dir)
+    tokenizer.save_pretrained(output_dir)
+
+    print(f"Finished training {model_name}. Output saved to {output_dir}")
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", type=str, required=True)
+    args = parser.parse_args()
+
+    train_single_model(args.model)
diff --git a/src/aixpert/training/training/run_factual_training.py b/src/aixpert/training/training/run_factual_training.py
new file mode 100644
index 0000000..0dc3249
--- /dev/null
+++ b/src/aixpert/training/training/run_factual_training.py
@@ -0,0 +1,94 @@
+"""Run Factual-DPO training for a single model and a single Δ value."""
+
+import argparse
+import os
+
+import wandb
+from training.factualdpo_trainer import DataCollatorForPreference
+from training.factualdpo_trainer import FactualDPOTrainer as DPOTrainer
+from unsloth import PatchDPOTrainer
+from utils.config_loader import load_config
+from utils.factual_trainer_utils import (
+    build_dpo_config,
+    load_and_clean_jsonl,
+    load_unsloth_model,
+)
+
+
+PatchDPOTrainer()
+
+
+def train_one_model(model_id: str, short: str, delta: float) -> None:
+    """Load config, datasets, model, then run training for a single (model, Δ) pair."""
+    cfg = load_config()
+    mod = cfg["factual_dpo"]
+
+    train_file = mod["paths"]["train_file"]
+    eval_file = mod["paths"]["eval_file"]
+    output_root = mod["paths"]["output_root"]
+
+    hp = mod["hyperparams"]
+    wandb_cfg = mod["wandb"]
+
+    output_dir = os.path.join(output_root, f"{short}_delta{delta}")
+    os.makedirs(output_dir, exist_ok=True)
+
+    train_dataset = load_and_clean_jsonl(train_file).shuffle(seed=42)
+    eval_dataset = load_and_clean_jsonl(eval_file).shuffle(seed=42)
+
+    print(f"Train size: {len(train_dataset)}")
+    print(f"Eval size : {len(eval_dataset)}")
+
+    wandb.init(
+        project=wandb_cfg["project"],
+        entity=wandb_cfg["entity"],
+        name=f"{wandb_cfg['run_prefix']}_{short}_delta{delta}",
+        config={
+            "model_name": model_id,
+            "delta": delta,
+            "epochs": hp["num_train_epochs"],
+        },
+    )
+
+    model, tokenizer = load_unsloth_model(model_id, hp["max_seq_length"])
+
+    dpo_cfg = build_dpo_config(hp, tokenizer, delta, output_dir)
+    collator = DataCollatorForPreference(tokenizer.pad_token_id)
+
+    trainer = DPOTrainer(
+        model=model,
+        ref_model=None,
+        args=dpo_cfg,
+        data_collator=collator,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+        processing_class=tokenizer,
+    )
+
+    print(f"\nStarting FactualDPO Training: {model_id} (Δ={delta})")
+    trainer.train()
+
+    trainer.save_model(output_dir)
+    tokenizer.save_pretrained(output_dir)
+
+    print(f"Saved model to: {output_dir}")
+    wandb.finish()
+
+
+def main() -> None:
+    """Parse CLI arguments and launch training for a single model–Δ combination."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model_id", type=str, required=True)
+    parser.add_argument("--short", type=str, required=True)
+    parser.add_argument("--delta", type=float, required=True)
+    args = parser.parse_args()
+
+    train_one_model(
+        model_id=args.model_id,
+        short=args.short,
+        delta=args.delta,
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/aixpert/training/training/trl/__init__.py b/src/aixpert/training/training/trl/__init__.py
new file mode 100644
index 0000000..1e49ae1
--- /dev/null
+++ b/src/aixpert/training/training/trl/__init__.py
@@ -0,0 +1,218 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import warnings
+from importlib.metadata import PackageNotFoundError, version
+from typing import TYPE_CHECKING
+
+from .import_utils import _LazyModule
+
+
+try:
+    __version__ = version("trl")
+except PackageNotFoundError:
+    __version__ = "unknown"
+
+_import_structure = {
+    "scripts": [
+        "DatasetMixtureConfig",
+        "ScriptArguments",
+        "TrlParser",
+        "get_dataset",
+        "init_zero_verbose",
+    ],
+    "data_utils": [
+        "apply_chat_template",
+        "extract_prompt",
+        "is_conversational",
+        "is_conversational_from_value",
+        "maybe_apply_chat_template",
+        "maybe_convert_to_chatml",
+        "maybe_extract_prompt",
+        "maybe_unpair_preference_dataset",
+        "pack_dataset",
+        "prepare_multimodal_messages",
+        "prepare_multimodal_messages_vllm",
+        "truncate_dataset",
+        "unpair_preference_dataset",
+    ],
+    "models": [
+        "SUPPORTED_ARCHITECTURES",
+        "AutoModelForCausalLMWithValueHead",
+        "AutoModelForSeq2SeqLMWithValueHead",
+        "PreTrainedModelWrapper",
+        "clone_chat_template",
+        "create_reference_model",
+        "setup_chat_format",
+    ],
+    "trainer": [
+        "AllTrueJudge",
+        "BaseBinaryJudge",
+        "BaseJudge",
+        "BasePairwiseJudge",
+        "BaseRankJudge",
+        "BCOConfig",
+        "BCOTrainer",
+        "CPOConfig",
+        "CPOTrainer",
+        "DPOConfig",
+        "DPOTrainer",
+        "FDivergenceConstants",
+        "FDivergenceType",
+        "GKDConfig",
+        "GKDTrainer",
+        "GRPOConfig",
+        "GRPOTrainer",
+        "HfPairwiseJudge",
+        "KTOConfig",
+        "KTOTrainer",
+        "LogCompletionsCallback",
+        "ModelConfig",
+        "NashMDConfig",
+        "NashMDTrainer",
+        "OnlineDPOConfig",
+        "OnlineDPOTrainer",
+        "OpenAIPairwiseJudge",
+        "ORPOConfig",
+        "ORPOTrainer",
+        "PairRMJudge",
+        "PPOConfig",
+        "PPOTrainer",
+        "PRMConfig",
+        "PRMTrainer",
+        "RewardConfig",
+        "RewardTrainer",
+        "RLOOConfig",
+        "RLOOTrainer",
+        "SFTConfig",
+        "SFTTrainer",
+        "WinRateCallback",
+        "XPOConfig",
+        "XPOTrainer",
+    ],
+    "trainer.callbacks": [
+        "BEMACallback",
+        "MergeModelCallback",
+        "RichProgressCallback",
+        "SyncRefModelCallback",
+        "WeaveCallback",
+    ],
+    "trainer.utils": [
+        "get_kbit_device_map",
+        "get_peft_config",
+        "get_quantization_config",
+    ],
+}
+
+if TYPE_CHECKING:
+    from .data_utils import (
+        apply_chat_template,
+        extract_prompt,
+        is_conversational,
+        is_conversational_from_value,
+        maybe_apply_chat_template,
+        maybe_convert_to_chatml,
+        maybe_extract_prompt,
+        maybe_unpair_preference_dataset,
+        pack_dataset,
+        prepare_multimodal_messages,
+        prepare_multimodal_messages_vllm,
+        truncate_dataset,
+        unpair_preference_dataset,
+    )
+    from .models import (
+        SUPPORTED_ARCHITECTURES,
+        AutoModelForCausalLMWithValueHead,
+        AutoModelForSeq2SeqLMWithValueHead,
+        PreTrainedModelWrapper,
+        clone_chat_template,
+        create_reference_model,
+        setup_chat_format,
+    )
+    from .scripts import (
+        DatasetMixtureConfig,
+        ScriptArguments,
+        TrlParser,
+        get_dataset,
+        init_zero_verbose,
+    )
+    from .trainer import (
+        AllTrueJudge,
+        BaseBinaryJudge,
+        BaseJudge,
+        BasePairwiseJudge,
+        BaseRankJudge,
+        BCOConfig,
+        BCOTrainer,
+        CPOConfig,
+        CPOTrainer,
+        DPOConfig,
+        DPOTrainer,
+        FDivergenceConstants,
+        FDivergenceType,
+        GKDConfig,
+        GKDTrainer,
+        GRPOConfig,
+        GRPOTrainer,
+        HfPairwiseJudge,
+        KTOConfig,
+        KTOTrainer,
+        LogCompletionsCallback,
+        ModelConfig,
+        NashMDConfig,
+        NashMDTrainer,
+        OnlineDPOConfig,
+        OnlineDPOTrainer,
+        OpenAIPairwiseJudge,
+        ORPOConfig,
+        ORPOTrainer,
+        PairRMJudge,
+        PPOConfig,
+        PPOTrainer,
+        PRMConfig,
+        PRMTrainer,
+        RewardConfig,
+        RewardTrainer,
+        RLOOConfig,
+        RLOOTrainer,
+        SFTConfig,
+        SFTTrainer,
+        WinRateCallback,
+        XPOConfig,
+        XPOTrainer,
+    )
+    from .trainer.callbacks import (
+        BEMACallback,
+        MergeModelCallback,
+        RichProgressCallback,
+        SyncRefModelCallback,
+        WeaveCallback,
+    )
+    from .trainer.utils import (
+        get_kbit_device_map,
+        get_peft_config,
+        get_quantization_config,
+    )
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={"__version__": __version__},
+    )
diff --git a/src/aixpert/training/training/trl/accelerate_configs/fsdp1.yaml b/src/aixpert/training/training/trl/accelerate_configs/fsdp1.yaml
new file mode 100644
index 0000000..c01b0b5
--- /dev/null
+++ b/src/aixpert/training/training/trl/accelerate_configs/fsdp1.yaml
@@ -0,0 +1,28 @@
+compute_environment: LOCAL_MACHINE
+debug: false
+distributed_type: FSDP
+downcast_bf16: 'no'
+enable_cpu_affinity: false
+fsdp_config:
+  fsdp_activation_checkpointing: false
+  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  fsdp_backward_prefetch: BACKWARD_PRE
+  fsdp_cpu_ram_efficient_loading: true
+  fsdp_forward_prefetch: true
+  fsdp_offload_params: false
+  fsdp_reshard_after_forward: FULL_SHARD
+  fsdp_state_dict_type: FULL_STATE_DICT
+  fsdp_sync_module_states: true
+  fsdp_use_orig_params: true
+  fsdp_version: 1
+machine_rank: 0
+main_training_function: main
+mixed_precision: bf16
+num_machines: 1
+num_processes: 8
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
diff --git a/src/aixpert/training/training/trl/accelerate_configs/fsdp2.yaml b/src/aixpert/training/training/trl/accelerate_configs/fsdp2.yaml
new file mode 100644
index 0000000..af498f3
--- /dev/null
+++ b/src/aixpert/training/training/trl/accelerate_configs/fsdp2.yaml
@@ -0,0 +1,25 @@
+# Requires accelerate 1.7.0 or higher
+compute_environment: LOCAL_MACHINE
+debug: false
+distributed_type: FSDP
+downcast_bf16: 'no'
+enable_cpu_affinity: false
+fsdp_config:
+  fsdp_activation_checkpointing: false
+  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  fsdp_cpu_ram_efficient_loading: true
+  fsdp_offload_params: false
+  fsdp_reshard_after_forward: true
+  fsdp_state_dict_type: FULL_STATE_DICT
+  fsdp_version: 2
+machine_rank: 0
+main_training_function: main
+mixed_precision: bf16
+num_machines: 1
+num_processes: 8
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
diff --git a/src/aixpert/training/training/trl/accelerate_configs/multi_gpu.yaml b/src/aixpert/training/training/trl/accelerate_configs/multi_gpu.yaml
new file mode 100644
index 0000000..15dad9b
--- /dev/null
+++ b/src/aixpert/training/training/trl/accelerate_configs/multi_gpu.yaml
@@ -0,0 +1,16 @@
+compute_environment: LOCAL_MACHINE
+debug: false
+distributed_type: MULTI_GPU
+downcast_bf16: 'no'
+gpu_ids: all
+machine_rank: 0
+main_training_function: main
+mixed_precision: 'bf16'
+num_machines: 1
+num_processes: 8
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
diff --git a/src/aixpert/training/training/trl/accelerate_configs/single_gpu.yaml b/src/aixpert/training/training/trl/accelerate_configs/single_gpu.yaml
new file mode 100644
index 0000000..ebd00a0
--- /dev/null
+++ b/src/aixpert/training/training/trl/accelerate_configs/single_gpu.yaml
@@ -0,0 +1,16 @@
+compute_environment: LOCAL_MACHINE
+debug: false
+distributed_type: "NO"
+downcast_bf16: 'no'
+gpu_ids: all
+machine_rank: 0
+main_training_function: main
+mixed_precision: 'bf16'
+num_machines: 1
+num_processes: 8
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
diff --git a/src/aixpert/training/training/trl/accelerate_configs/zero1.yaml b/src/aixpert/training/training/trl/accelerate_configs/zero1.yaml
new file mode 100644
index 0000000..d5b5f78
--- /dev/null
+++ b/src/aixpert/training/training/trl/accelerate_configs/zero1.yaml
@@ -0,0 +1,20 @@
+compute_environment: LOCAL_MACHINE
+debug: false
+deepspeed_config:
+  deepspeed_multinode_launcher: standard
+  gradient_accumulation_steps: 1
+  zero3_init_flag: false
+  zero_stage: 1
+distributed_type: DEEPSPEED
+downcast_bf16: 'no'
+machine_rank: 0
+main_training_function: main
+mixed_precision: 'bf16'
+num_machines: 1
+num_processes: 8
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
diff --git a/src/aixpert/training/training/trl/accelerate_configs/zero2.yaml b/src/aixpert/training/training/trl/accelerate_configs/zero2.yaml
new file mode 100644
index 0000000..239b14a
--- /dev/null
+++ b/src/aixpert/training/training/trl/accelerate_configs/zero2.yaml
@@ -0,0 +1,21 @@
+compute_environment: LOCAL_MACHINE
+debug: false
+deepspeed_config:
+  deepspeed_multinode_launcher: standard
+  offload_optimizer_device: none
+  offload_param_device: none
+  zero3_init_flag: false
+  zero_stage: 2
+distributed_type: DEEPSPEED
+downcast_bf16: 'no'
+machine_rank: 0
+main_training_function: main
+mixed_precision: 'bf16'
+num_machines: 1
+num_processes: 8
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
diff --git a/src/aixpert/training/training/trl/accelerate_configs/zero3.yaml b/src/aixpert/training/training/trl/accelerate_configs/zero3.yaml
new file mode 100644
index 0000000..b5a1201
--- /dev/null
+++ b/src/aixpert/training/training/trl/accelerate_configs/zero3.yaml
@@ -0,0 +1,22 @@
+compute_environment: LOCAL_MACHINE
+debug: false
+deepspeed_config:
+  deepspeed_multinode_launcher: standard
+  offload_optimizer_device: none
+  offload_param_device: none
+  zero3_init_flag: true
+  zero3_save_16bit_model: true
+  zero_stage: 3
+distributed_type: DEEPSPEED
+downcast_bf16: 'no'
+machine_rank: 0
+main_training_function: main
+mixed_precision: bf16
+num_machines: 1
+num_processes: 8
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
diff --git a/src/aixpert/training/training/trl/cli.py b/src/aixpert/training/training/trl/cli.py
new file mode 100644
index 0000000..64a0d3a
--- /dev/null
+++ b/src/aixpert/training/training/trl/cli.py
@@ -0,0 +1,176 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+from importlib import resources
+
+import torch
+from accelerate import logging
+from accelerate.commands.launch import launch_command, launch_command_parser
+
+from .scripts.dpo import make_parser as make_dpo_parser
+from .scripts.env import print_env
+from .scripts.grpo import make_parser as make_grpo_parser
+from .scripts.kto import make_parser as make_kto_parser
+from .scripts.reward import make_parser as make_reward_parser
+from .scripts.rloo import make_parser as make_rloo_parser
+from .scripts.sft import make_parser as make_sft_parser
+from .scripts.utils import TrlParser
+from .scripts.vllm_serve import main as vllm_serve_main
+from .scripts.vllm_serve import make_parser as make_vllm_serve_parser
+
+
+logger = logging.get_logger(__name__)
+
+
+def main():
+    parser = TrlParser(prog="TRL CLI", usage="trl", allow_abbrev=False)
+
+    # Add the subparsers
+    subparsers = parser.add_subparsers(
+        help="available commands", dest="command", parser_class=TrlParser
+    )
+
+    # Add the subparsers for every script
+    make_dpo_parser(subparsers)
+    subparsers.add_parser("env", help="Print the environment information")
+    make_grpo_parser(subparsers)
+    make_kto_parser(subparsers)
+    make_reward_parser(subparsers)
+    make_rloo_parser(subparsers)
+    make_sft_parser(subparsers)
+    make_vllm_serve_parser(subparsers)
+
+    # Parse the arguments; the remaining ones (`launch_args`) are passed to the 'accelerate launch' subparser.
+    # Duplicates may occur if the same argument is provided in both the config file and CLI.
+    # For example: launch_args = `["--num_processes", "4", "--num_processes", "8"]`.
+    # Deduplication and precedence (CLI over config) are handled later by launch_command_parser.
+    args, launch_args = parser.parse_args_and_config(return_remaining_strings=True)
+
+    # Replace `--accelerate_config foo` with `--config_file trl/accelerate_configs/foo.yaml` if it is present in the
+    # launch_args. It allows the user to use predefined accelerate configs from the `trl` package.
+    if "--accelerate_config" in launch_args:
+        # Get the index of the '--accelerate_config' argument and the corresponding config name
+        config_index = launch_args.index("--accelerate_config")
+        config_name = launch_args[config_index + 1]
+
+        # If the config_name correspond to a path in the filesystem, we don't want to override it
+        if os.path.isfile(config_name):
+            accelerate_config_path = config_name
+        elif (
+            resources.files("trl.accelerate_configs")
+            .joinpath(f"{config_name}.yaml")
+            .exists()
+        ):
+            # Get the predefined accelerate config path from the package resources
+            accelerate_config_path = resources.files("trl.accelerate_configs").joinpath(
+                f"{config_name}.yaml"
+            )
+        else:
+            raise ValueError(
+                f"Accelerate config {config_name} is neither a file nor a valid config in the `trl` package. "
+                "Please provide a valid config name or a path to a config file."
+            )
+
+        # Remove '--accelerate_config' and its corresponding config name
+        launch_args.pop(config_index)
+        launch_args.pop(config_index)
+
+        # Insert '--config_file' and the absolute path to the front of the list
+        launch_args = ["--config_file", str(accelerate_config_path)] + launch_args
+
+    if args.command == "dpo":
+        # Get the default args for the launch command
+        dpo_training_script = resources.files("trl.scripts").joinpath("dpo.py")
+        args = launch_command_parser().parse_args([str(dpo_training_script)])
+
+        # Feed the args to the launch command
+        args.training_script_args = sys.argv[2:]  # remove "trl" and "dpo"
+        launch_command(args)  # launch training
+
+    elif args.command == "env":
+        print_env()
+
+    elif args.command == "grpo":
+        # Get the default args for the launch command
+        grpo_training_script = resources.files("trl.scripts").joinpath("grpo.py")
+        args = launch_command_parser().parse_args([str(grpo_training_script)])
+
+        # Feed the args to the launch command
+        args.training_script_args = sys.argv[2:]  # remove "trl" and "grpo"
+        launch_command(args)  # launch training
+
+    elif args.command == "kto":
+        # Get the default args for the launch command
+        kto_training_script = resources.files("trl.scripts").joinpath("kto.py")
+        args = launch_command_parser().parse_args([str(kto_training_script)])
+
+        # Feed the args to the launch command
+        args.training_script_args = sys.argv[2:]  # remove "trl" and "kto"
+        launch_command(args)  # launch training
+
+    elif args.command == "reward":
+        # Get the default args for the launch command
+        reward_training_script = resources.files("trl.scripts").joinpath("reward.py")
+        args = launch_command_parser().parse_args([str(reward_training_script)])
+
+        # Feed the args to the launch command
+        args.training_script_args = sys.argv[2:]  # remove "trl" and "reward"
+        launch_command(args)  # launch training
+
+    elif args.command == "rloo":
+        # Get the default args for the launch command
+        rloo_training_script = resources.files("trl.scripts").joinpath("rloo.py")
+        args = launch_command_parser().parse_args([str(rloo_training_script)])
+
+        # Feed the args to the launch command
+        args.training_script_args = sys.argv[2:]  # remove "trl" and "rloo"
+        launch_command(args)  # launch training
+
+    elif args.command == "sft":
+        # Get the path to the training script
+        sft_training_script = resources.files("trl.scripts").joinpath("sft.py")
+
+        # This simulates running: `accelerate launch <launch args> sft.py <training script args>`.
+        # Note that the training script args may include launch-related arguments (e.g., `--num_processes`),
+        # but we rely on the script to ignore any that don't apply to it.
+        training_script_args = sys.argv[2:]  # Remove "trl" and "sft"
+        args = launch_command_parser().parse_args(
+            launch_args + [str(sft_training_script)] + training_script_args
+        )
+        launch_command(args)  # launch training
+
+    elif args.command == "vllm-serve":
+        (script_args,) = parser.parse_args_and_config()
+
+        # Known issue: Using DeepSpeed with tensor_parallel_size=1 and data_parallel_size>1 may cause a crash when
+        # launched via the CLI. Suggest running the module directly.
+        # More information: https://github.com/vllm-project/vllm/issues/17079
+        if (
+            script_args.tensor_parallel_size == 1
+            and script_args.data_parallel_size > 1
+            and torch.cuda.is_available()
+        ):
+            logger.warning(
+                "Detected configuration: tensor_parallel_size=1 and data_parallel_size>1. This setup is known to "
+                "cause a crash when using the `trl vllm-serve` CLI entry point. As a workaround, please run the "
+                "server using the module path instead: `python -m trl.scripts.vllm_serve`",
+            )
+
+        vllm_serve_main(script_args)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/aixpert/training/training/trl/data_utils.py b/src/aixpert/training/training/trl/data_utils.py
new file mode 100644
index 0000000..0f598bc
--- /dev/null
+++ b/src/aixpert/training/training/trl/data_utils.py
@@ -0,0 +1,1027 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+from collections import defaultdict, deque
+from collections.abc import Callable, Sequence
+from itertools import takewhile
+from typing import Any, TypeVar
+
+import numpy as np
+import pyarrow as pa
+import pyarrow.compute as pc
+import pyarrow.types
+from datasets import Dataset, DatasetDict
+from transformers import PreTrainedTokenizerBase, ProcessorMixin
+
+
+DatasetType = TypeVar("DatasetType", Dataset, DatasetDict)
+
+
+def prepare_multimodal_messages(
+    messages: list[dict[str, Any]], images: list
+) -> list[dict[str, Any]]:
+    # docstyle-ignore  # because <Image> is not parsable in the code block
+    """
+    Convert messages into a structured multimodal format and inject the provided images into the message contents.
+
+    Args:
+        messages (`list[dict[str, Any]]`):
+            Messages with `"role"` and `"content"`. Content may be a raw string before transformation. List of messages
+            a `"role"` key (`"system"`, `"user"`, or `"assistant"`) and a `"content"` key containing either a string or
+            a list of structured blocks if already prepared.
+        images (`list`):
+            List of image objects to insert.
+
+    Returns
+    -------
+        `list[dict[str, Any]]`: A deep-copied list of messages where every `"content"` value is a list of structured
+        content blocks, and all `"image"` placeholders are populated with the corresponding image objects.
+
+    Notes
+    -----
+        - When the input `messages` isn't already in the structured format, (i.e., all `"content"` values are strings),
+          the function transforms them into the structured format by wrapping text in `{"type": "text", "text": ...}`
+          and inserting `{"type": "image"}` placeholders for the images *before* the first user message.
+        - When the input `messages` is already in the structured format (i.e., all `"content"` values are lists of
+          structured blocks), the function only fills in the actual images in the existing `{"type": "image"}`
+          placeholders. If the number of placeholders does not match the number of provided images, an error is raised.
+
+    Example:
+    ```python
+    # Input
+    [
+        {"role": "user", "content": "What's in this image?"},
+        {"role": "assistant", "content": "It looks like a cat."},
+    ]
+
+    # Output, one image provided
+    [
+        {"role": "user", "content": [{"type": "image", "image": <PIL.Image.Image>}, {"type": "text", "text": "What's in this image?"}]},
+        {"role": "assistant", "content": [{"type": "text", "text": "It looks like a cat."}]},
+    ]
+    ```
+    """
+    messages = copy.deepcopy(messages)  # avoid modifying the original messages
+
+    # First, convert all messages to the structured format if needed, and insert image placeholders if needed
+    images_included = False
+    for message in messages:
+        if message["role"] == "system":
+            if isinstance(
+                message["content"], str
+            ):  # if already prepared, the content will be a list
+                message["content"] = [{"type": "text", "text": message["content"]}]
+        elif message["role"] == "user":
+            if isinstance(message["content"], str) and not images_included:
+                image_entries = [{"type": "image"} for _ in range(len(images))]
+                message["content"] = [
+                    *image_entries,
+                    {"type": "text", "text": message["content"]},
+                ]
+                images_included = True
+            elif isinstance(message["content"], str) and images_included:
+                message["content"] = [{"type": "text", "text": message["content"]}]
+        elif message["role"] == "assistant":
+            if isinstance(message["content"], str):
+                message["content"] = [{"type": "text", "text": message["content"]}]
+        else:
+            raise ValueError(
+                f"Invalid role in message: {message['role']}. Expected 'user', 'assistant', or 'system'."
+            )
+
+    # Then, check that the number of image placeholders matches the number of images provided
+    num_placeholders = sum(
+        sum(1 for part in message["content"] if part["type"] == "image")
+        for message in messages
+    )
+    if num_placeholders != len(images):
+        raise ValueError(
+            f"Number of images provided ({len(images)}) does not match number of image placeholders ({num_placeholders})."
+        )
+
+    # Then, fill in the actual images in the placeholders
+    img_idx = 0
+    for message in messages:
+        for part in message["content"]:
+            if part["type"] == "image":
+                part["image"] = images[img_idx]
+                img_idx += 1
+
+    return messages
+
+
+def prepare_multimodal_messages_vllm(
+    messages: list[dict[str, Any]],
+) -> list[dict[str, Any]]:
+    # docstyle-ignore  # because <Image> is not parsable in the code block
+    """
+    Convert structured multimodal messages into a format compatible with vLLM. Replaces `"type": "image"` blocks with
+    `"type": "image_pil"` blocks, and `"image": Image` with `"image_pil": Image`.
+
+    Args:
+        messages (`list[dict[str, Any]]`):
+            Messages with `"role"` and `"content"`. Content is expected to be a list of structured blocks.
+
+    Returns
+    -------
+        `list[dict[str, Any]]`:
+            A deep-copied list of messages compatible with vLLM's expected input format.
+
+    Example:
+    ```python
+    # Input
+    [{"role": "user", "content": [{"type": "image", "image": <PIL.Image.Image>}, {"type": "text", "text": "What's in this image?"}]}]
+
+    # Output
+    [{"role": "user", "content": [{"type": "image_pil", "image_pil": <PIL.Image.Image>}, {"type": "text", "text": "What's in this image?"}]}]
+    ```
+    """
+    messages = copy.deepcopy(messages)  # avoid modifying the original messages
+    for message in messages:
+        if isinstance(message["content"], list):
+            for part in message["content"]:
+                if part["type"] == "image":
+                    part["type"] = (
+                        "image_pil"  # vLLM expects 'image_pil' key for images
+                    )
+                    part["image_pil"] = part.pop("image")
+    return messages
+
+
+def is_conversational(example: dict[str, Any]) -> bool:
+    r"""
+    Check if the example is in a conversational format.
+
+    Args:
+        example (`dict[str, Any]`):
+            A single data entry of a dataset. The example can have different keys depending on the dataset type.
+
+    Returns
+    -------
+        `bool`:
+            `True` if the data is in a conversational format, `False` otherwise.
+
+    Examples
+    --------
+    ```python
+    >>> example = {"prompt": [{"role": "user", "content": "What color is the sky?"}]}
+    >>> is_conversational(example)
+    True
+
+    >>> example = {"prompt": "The sky is"}
+    >>> is_conversational(example)
+    False
+    ```
+    """
+    supported_keys = ["prompt", "chosen", "rejected", "completion", "messages"]
+    example_keys = {key for key in example if key in supported_keys}
+
+    # It must have one of the supported keys
+    if example_keys:
+        key = example_keys.pop()  # take the first supported key
+        maybe_messages = example[key]
+        # It must be a list of messages
+        if isinstance(maybe_messages, list):
+            maybe_message = maybe_messages[0]
+            # Each message must a list of dictionaries with keys "role" and "content"
+            if (
+                isinstance(maybe_message, dict)
+                and "role" in maybe_message
+                and "content" in maybe_message
+            ):
+                return True
+
+    return False
+
+
+def apply_chat_template(
+    example: dict[str, list[dict[str, str]]],
+    tokenizer: PreTrainedTokenizerBase | ProcessorMixin,
+    tools: list[dict | Callable] | None = None,
+    **template_kwargs,
+) -> dict[str, str]:
+    r"""
+    Apply a chat template to a conversational example along with the schema for a list of functions in `tools`.
+
+    For more details, see [`maybe_apply_chat_template`].
+    """
+    # Check that the example has the correct keys
+    supported_keys = ["prompt", "chosen", "rejected", "completion", "messages", "label"]
+    example_keys = {key for key in example if key in supported_keys}
+    if example_keys not in [
+        {"messages"},  # language modeling
+        {"prompt"},  # prompt-only
+        {"prompt", "completion"},  # prompt-completion
+        {"prompt", "chosen", "rejected"},  # preference
+        {"chosen", "rejected"},  # preference with implicit prompt
+        {"prompt", "completion", "label"},  # unpaired preference
+    ]:
+        raise KeyError(f"Invalid keys in the example: {example_keys}")
+
+    # Apply the chat template to the whole conversation
+    if "messages" in example:
+        messages = tokenizer.apply_chat_template(
+            example["messages"],
+            tools=tools,
+            tokenize=False,
+            **example.get("chat_template_kwargs", {}),
+            **template_kwargs,
+        )
+
+    # Apply the chat template to the prompt, adding the generation prompt
+    if "prompt" in example:
+        last_role = example["prompt"][-1]["role"]
+        if last_role == "user":
+            add_generation_prompt = True
+            continue_final_message = False
+        elif last_role == "assistant":
+            add_generation_prompt = False
+            continue_final_message = True
+        else:
+            raise ValueError(f"Invalid role in the last message: {last_role}")
+        prompt = tokenizer.apply_chat_template(
+            example["prompt"],
+            tools=tools,
+            continue_final_message=continue_final_message,
+            tokenize=False,
+            add_generation_prompt=add_generation_prompt,
+            **example.get("chat_template_kwargs", {}),
+            **template_kwargs,
+        )
+
+    # Apply the chat template to the entire prompt + completion
+    if "prompt" in example:  # explicit prompt and prompt-completion case
+        if "chosen" in example:
+            prompt_chosen = tokenizer.apply_chat_template(
+                example["prompt"] + example["chosen"],
+                tools=tools,
+                tokenize=False,
+                **example.get("chat_template_kwargs", {}),
+                **template_kwargs,
+            )
+            # DeepSeek-R1 inserts a <tool_call> token when using `add_generation_prompt`, which can cause discrepancies
+            # between the prompt alone and the combined prompt+completion. To ensure consistency, we extract the
+            # common prefix between the two. In most cases, this is a no-op.
+            prompt = "".join(
+                x
+                for x, _ in takewhile(
+                    lambda x: x[0] == x[1], zip(prompt, prompt_chosen, strict=False)
+                )
+            )
+
+            chosen = prompt_chosen[len(prompt) :]
+        if "rejected" in example and "prompt" in example:  # explicit prompt
+            prompt_rejected = tokenizer.apply_chat_template(
+                example["prompt"] + example["rejected"],
+                tools=tools,
+                tokenize=False,
+                **example.get("chat_template_kwargs", {}),
+                **template_kwargs,
+            )
+            # Handle DeepSeek-R1 <tool_call> token, see the above comment for details
+            prompt = "".join(
+                x
+                for x, _ in takewhile(
+                    lambda x: x[0] == x[1], zip(prompt, prompt_rejected, strict=False)
+                )
+            )
+            rejected = prompt_rejected[len(prompt) :]
+        if "completion" in example:
+            prompt_completion = tokenizer.apply_chat_template(
+                example["prompt"] + example["completion"],
+                tools=tools,
+                tokenize=False,
+                **example.get("chat_template_kwargs", {}),
+                **template_kwargs,
+            )
+            # Handle DeepSeek-R1 <tool_call> token, see the above comment for details
+            prompt = "".join(
+                x
+                for x, _ in takewhile(
+                    lambda x: x[0] == x[1], zip(prompt, prompt_completion, strict=False)
+                )
+            )
+            completion = prompt_completion[len(prompt) :]
+    else:  # implicit prompt case
+        if "chosen" in example:
+            chosen = tokenizer.apply_chat_template(
+                example["chosen"],
+                tools=tools,
+                tokenize=False,
+                **example.get("chat_template_kwargs", {}),
+                **template_kwargs,
+            )
+        if "rejected" in example:
+            rejected = tokenizer.apply_chat_template(
+                example["rejected"],
+                tools=tools,
+                tokenize=False,
+                **example.get("chat_template_kwargs", {}),
+                **template_kwargs,
+            )
+
+    # Extract the completion by removing the prompt part from the prompt-completion string
+    output = {}
+    if "messages" in example:
+        output["text"] = messages
+    if "prompt" in example:
+        output["prompt"] = prompt
+    if "chosen" in example:
+        output["chosen"] = chosen
+    if "rejected" in example:
+        output["rejected"] = rejected
+    if "completion" in example:
+        output["completion"] = completion
+    if "label" in example:
+        output["label"] = example["label"]
+
+    return output
+
+
+def maybe_apply_chat_template(
+    example: dict[str, list[dict[str, str]]],
+    tokenizer: PreTrainedTokenizerBase,
+    tools: list[dict | Callable] | None = None,
+    **template_kwargs: Any,
+) -> dict[str, str]:
+    r"""
+    If the example is in a conversational format, apply a chat template to it.
+
+    Args:
+        example (`dict[str, list[dict[str, str]]`):
+            Dictionary representing a single data entry of a conversational dataset. Each data entry can have different
+            keys depending on the dataset type. The supported dataset types are:
+
+                - Language modeling dataset: `"messages"`.
+                - Prompt-only dataset: `"prompt"`.
+                - Prompt-completion dataset: `"prompt"` and `"completion"`.
+                - Preference dataset: `"prompt"`, `"chosen"`, and `"rejected"`.
+                - Preference dataset with implicit prompt: `"chosen"` and `"rejected"`.
+                - Unpaired preference dataset: `"prompt"`, `"completion"`, and `"label"`.
+
+            For keys `"messages"`, `"prompt"`, `"chosen"`, `"rejected"`, and `"completion"`, the values are lists of
+            messages, where each message is a dictionary with keys `"role"` and `"content"`. Additionally, the example
+            may contain a `"chat_template_kwargs"` key, which is a dictionary of additional keyword arguments to pass
+            to the chat template renderer.
+        tokenizer ([`~transformers.PreTrainedTokenizerBase`]):
+            Tokenizer to apply the chat template with.
+        tools (`list[dict | Callable]`, *optional*):
+            A list of tools (callable functions) that will be accessible to the model. If the template does not support
+            function calling, this argument will have no effect.
+        **template_kwargs (`Any`, *optional*):
+            Additional kwargs to pass to the template renderer. Will be accessible by the chat template.
+
+    Returns
+    -------
+        `dict[str, str]`:
+            Formatted example with the chat template applied.
+
+    Notes
+    -----
+        - This function does not alter the keys, except for Language modeling dataset, where `"messages"` is replaced
+        by `"text"`.
+
+        - In case of prompt-only data, if the last role is `"user"`, the generation prompt is added to the prompt.
+        Else, if the last role is `"assistant"`, the final message is continued.
+
+    Example:
+
+    ```python
+    >>> from transformers import AutoTokenizer
+
+    >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")
+    >>> example = {
+    ...     "prompt": [{"role": "user", "content": "What color is the sky?"}],
+    ...     "completion": [{"role": "assistant", "content": "It is blue."}],
+    ... }
+    >>> apply_chat_template(example, tokenizer)
+    {'prompt': '<|user|>\nWhat color is the sky?<|end|>\n<|assistant|>\n', 'completion': 'It is blue.<|end|>\n'}
+    ```
+    """
+    if is_conversational(example):
+        return apply_chat_template(example, tokenizer, tools, **template_kwargs)
+    return example
+
+
+def _unpair_row(
+    examples: list[dict[str, list[dict[str, str]]]],
+) -> list[dict[str, list[dict[str, str]]]]:
+    batch_size = len(examples["chosen"])
+    new_rows = {
+        "completion": examples["chosen"] + examples["rejected"],
+        "label": [True] * batch_size + [False] * batch_size,
+    }
+    if "prompt" in examples:
+        new_rows["prompt"] = examples["prompt"] + examples["prompt"]
+    return new_rows
+
+
+def unpair_preference_dataset(
+    dataset: DatasetType, num_proc: int | None = None, desc: str | None = None
+) -> DatasetType:
+    r"""
+    Unpair a preference dataset.
+
+    Args:
+        dataset ([`~datasets.Dataset`] or [`~datasets.DatasetDict`]):
+            Preference dataset to unpair. The dataset must have columns `"chosen"`, `"rejected"` and optionally
+            `"prompt"`.
+        num_proc (`int`, *optional*):
+            Number of processes to use for processing the dataset.
+        desc (`str`, *optional*):
+            Meaningful description to be displayed alongside with the progress bar while mapping examples.
+
+    Returns
+    -------
+        [`~datasets.Dataset`]: The unpaired preference dataset.
+
+    Example:
+
+    ```python
+    >>> from datasets import Dataset
+
+    >>> dataset_dict = {
+    ...     "prompt": ["The sky is", "The sun is"],
+    ...     "chosen": [" blue.", "in the sky."],
+    ...     "rejected": [" green.", " in the sea."],
+    ... }
+    >>> dataset = Dataset.from_dict(dataset_dict)
+    >>> dataset = unpair_preference_dataset(dataset)
+    >>> dataset
+    Dataset({
+        features: ['prompt', 'completion', 'label'],
+        num_rows: 4
+    })
+
+    >>> dataset[0]
+    {'prompt': 'The sky is', 'completion': ' blue.', 'label': True}
+    ```
+    """
+    return dataset.map(
+        _unpair_row,
+        batched=True,
+        remove_columns=["chosen", "rejected"],
+        num_proc=num_proc,
+        desc=desc,
+    )
+
+
+def maybe_unpair_preference_dataset(
+    dataset: DatasetType, num_proc: int | None = None, desc: str | None = None
+) -> DatasetType:
+    r"""
+    Unpair a preference dataset if it is paired.
+
+    Args:
+        dataset ([`~datasets.Dataset`] or [`~datasets.DatasetDict`]):
+            Preference dataset to unpair. The dataset must have columns `"chosen"`, `"rejected"` and optionally
+            `"prompt"`.
+        num_proc (`int`, *optional*):
+            Number of processes to use for processing the dataset.
+        desc (`str`, *optional*):
+            Meaningful description to be displayed alongside with the progress bar while mapping examples.
+
+    Returns
+    -------
+        [`~datasets.Dataset`] or [`~datasets.DatasetDict`]: The unpaired preference dataset if it was paired, otherwise
+        the original dataset.
+
+    Example:
+
+    ```python
+    >>> from datasets import Dataset
+
+    >>> dataset_dict = {
+    ...     "prompt": ["The sky is", "The sun is"],
+    ...     "chosen": [" blue.", "in the sky."],
+    ...     "rejected": [" green.", " in the sea."],
+    ... }
+    >>> dataset = Dataset.from_dict(dataset_dict)
+    >>> dataset = unpair_preference_dataset(dataset)
+    >>> dataset
+    Dataset({
+        features: ['prompt', 'completion', 'label'],
+        num_rows: 4
+    })
+
+    >>> dataset[0]
+    {'prompt': 'The sky is', 'completion': ' blue.', 'label': True}
+    ```
+    """
+    if isinstance(dataset, DatasetDict):
+        column_names = dataset[list(dataset.keys())[0]].column_names
+    else:
+        column_names = dataset.column_names
+    if "chosen" in column_names and "rejected" in column_names:
+        return unpair_preference_dataset(dataset, num_proc=num_proc, desc=desc)
+    return dataset
+
+
+def extract_prompt(example: dict[str, Sequence]) -> dict[str, Sequence]:
+    r"""
+    Extracts the shared prompt from a preference data example, where the prompt is implicit within both the chosen and
+    rejected completions.
+
+    For more details, see [`maybe_extract_prompt`].
+    """
+    for idx in range(min(len(example["chosen"]), len(example["rejected"]))):
+        if example["chosen"][idx] != example["rejected"][idx]:
+            if example["chosen"][idx - 1] == " ":  # remove space before the prompt
+                idx -= 1
+            break
+    return {
+        "prompt": example["chosen"][:idx],
+        "chosen": example["chosen"][idx:],
+        "rejected": example["rejected"][idx:],
+    }
+
+
+def maybe_extract_prompt(example: dict[str, list]) -> dict[str, list]:
+    r"""
+    Extracts the shared prompt from a preference data example, where the prompt is implicit within both the chosen and
+    rejected completions.
+
+    If the example already contains a `"prompt"` key, the function returns the example as is. Else, the function
+    identifies the longest common sequence (prefix) of conversation turns between the "chosen" and "rejected"
+    completions and extracts this as the prompt. It then removes this prompt from the respective "chosen" and
+    "rejected" completions.
+
+    Args:
+        example (`dict[str, list]`):
+            A dictionary representing a single data entry in the preference dataset. It must contain the keys
+            `"chosen"` and `"rejected"`, where each value is either conversational or standard (`str`).
+
+    Returns
+    -------
+        `dict[str, list]`: A dictionary containing:
+            - `"prompt"`: The longest common prefix between the "chosen" and "rejected" completions.
+            - `"chosen"`: The remainder of the "chosen" completion, with the prompt removed.
+            - `"rejected"`: The remainder of the "rejected" completion, with the prompt removed.
+
+    Examples
+    --------
+    ```python
+    >>> example = {
+    ...     "chosen": [
+    ...         {"role": "user", "content": "What color is the sky?"},
+    ...         {"role": "assistant", "content": "It is blue."},
+    ...     ],
+    ...     "rejected": [
+    ...         {"role": "user", "content": "What color is the sky?"},
+    ...         {"role": "assistant", "content": "It is green."},
+    ...     ],
+    ... }
+    >>> extract_prompt(example)
+    {'prompt': [{'role': 'user', 'content': 'What color is the sky?'}],
+     'chosen': [{'role': 'assistant', 'content': 'It is blue.'}],
+     'rejected': [{'role': 'assistant', 'content': 'It is green.'}]}
+    ```
+
+    Or, with the `map` method of [`~datasets.Dataset`]:
+
+    ```python
+    >>> from trl import extract_prompt
+    >>> from datasets import Dataset
+
+    >>> dataset_dict = {
+    ...     "chosen": [
+    ...         [
+    ...             {"role": "user", "content": "What color is the sky?"},
+    ...             {"role": "assistant", "content": "It is blue."},
+    ...         ],
+    ...         [
+    ...             {"role": "user", "content": "Where is the sun?"},
+    ...             {"role": "assistant", "content": "In the sky."},
+    ...         ],
+    ...     ],
+    ...     "rejected": [
+    ...         [
+    ...             {"role": "user", "content": "What color is the sky?"},
+    ...             {"role": "assistant", "content": "It is green."},
+    ...         ],
+    ...         [
+    ...             {"role": "user", "content": "Where is the sun?"},
+    ...             {"role": "assistant", "content": "In the sea."},
+    ...         ],
+    ...     ],
+    ... }
+    >>> dataset = Dataset.from_dict(dataset_dict)
+    >>> dataset = dataset.map(extract_prompt)
+    >>> dataset[0]
+    {'prompt': [{'role': 'user', 'content': 'What color is the sky?'}],
+     'chosen': [{'role': 'assistant', 'content': 'It is blue.'}],
+     'rejected': [{'role': 'assistant', 'content': 'It is green.'}]}
+    ```
+    """
+    # Some dataset add a `"prompt"` column, even though the prompt is implicit and included in the "chosen" and
+    # "rejected" completions. E.g.:
+    # {"prompt": "What color is the sky?",
+    #  "chosen": [{"role": "user", "content": "What color is the sky?"}, {"role": "assistant", "content": "It is blue."}],
+    #  "rejected": [{"role": "user", "content": "What color is the sky?"}, {"role": "assistant", "content": "It is green."}]}
+    # That's why we check if the prompt is also conversational before deciding not to extract it.
+    if "chosen" not in example or "rejected" not in example:  # not a preference example
+        return example
+    if "prompt" in example:
+        # Both conversational or both non-conversational
+        chosen_conv = is_conversational({"chosen": example["chosen"]})
+        prompt_conv = is_conversational({"prompt": example["prompt"]})
+        if (chosen_conv and prompt_conv) or (not chosen_conv and not prompt_conv):
+            return example
+    return extract_prompt(
+        {"chosen": example["chosen"], "rejected": example["rejected"]}
+    )
+
+
+class _SegmentTree:
+    """
+    A segment tree data structure that, when initialized as `_SegmentTree(maxval)`, efficiently finds the next larger
+    value for a given input within the range [1, maxval].
+
+    See [Fewer Truncations Improve Language Modeling](https://arxiv.org/abs/2404.10830) for more details.
+    """
+
+    def __init__(self, maxval: int):
+        self.maxval = maxval
+        # For non-power-of-2 values, we need to round up to the next power of 2 for the tree size
+        self.tree_size = 1 << (maxval - 1).bit_length()
+        self.tree = [0] * (2 * self.tree_size)
+
+    def add(self, val):
+        assert 0 < val <= self.maxval
+        i = self.tree_size + val - 1
+        self.tree[i] = val
+        while i > 1:
+            i >>= 1
+            left, right = self.tree[i << 1], self.tree[(i << 1) + 1]
+            # Compare the values using if-else otherwise repeated calls to `builtins.max` become the bottleneck
+            self.tree[i] = left if left >= right else right
+
+    def remove(self, val):
+        assert 0 < val <= self.maxval
+        i = self.tree_size + val - 1
+        self.tree[i] = 0
+        while i > 1:
+            i >>= 1
+            left, right = self.tree[i << 1], self.tree[(i << 1) + 1]
+            # Compare the values using if-else otherwise repeated calls to `builtins.max` become the bottleneck
+            self.tree[i] = left if left >= right else right
+
+    def search(self, val):
+        assert 0 < val <= self.maxval
+        i = 1
+        while i < self.tree_size:
+            if self.tree[i << 1] >= val:
+                i = i << 1
+            else:
+                i = (i << 1) + 1
+        return self.tree[i]
+
+
+def _pack_bfd(examples: pa.Table, seq_length: int) -> pa.Table:
+    """Pack sequences in a pyarrow Table using Best Fit Decreasing strategy."""
+    columns = []
+    list_column_idx = None
+    for idx, column in enumerate(examples.columns):
+        if pyarrow.types.is_list(column.type) or pyarrow.types.is_large_list(
+            column.type
+        ):
+            column = pc.list_slice(column, 0, seq_length)
+            if list_column_idx is None:
+                list_column_idx = idx
+        columns.append(column)
+    examples = pa.Table.from_arrays(columns, names=examples.column_names)
+
+    ids = np.arange(len(examples))
+    assert list_column_idx is not None
+    lengths = pc.list_value_length(examples[list_column_idx]).combine_chunks()
+    examples = examples.append_column(
+        "seq_lengths", lengths
+    )  # Allows us to later construct `position_ids`
+    lengths = pc.make_struct(lengths, ids)
+    lengths = lengths.sort("descending", by=0)
+
+    segment_tree = _SegmentTree(seq_length)
+    segment_tree.add(seq_length)  # the max, `seq_length` bin is always available
+    space_to_bin = defaultdict(deque)
+
+    # Bin is represented as a dict (of example ids and sum of their lengths) to allow in-place updates
+    bins: list[dict] = []
+    for length, idx in zip(
+        lengths.field(0).to_numpy(), lengths.field(1).to_numpy(), strict=True
+    ):
+        space = segment_tree.search(length)
+
+        if space < seq_length:
+            # Use existing bin with exactly this amount of space
+            bin = space_to_bin[space].popleft()
+        else:
+            # Create a new bin
+            bin = {"ids": [], "length": 0}
+            bins.append(bin)
+
+        bin["ids"].append(idx)
+        bin["length"] += length
+        if space < seq_length and not space_to_bin[space]:
+            segment_tree.remove(space)
+
+        space = space - length
+        space_to_bin[space].append(bin)
+        if space > 0:
+            segment_tree.add(space)
+
+    examples = pc.take(examples, [id_ for bin in bins for id_ in bin["ids"]])
+    offsets = np.array([0] + [bin["length"] for bin in bins])
+    offsets = np.cumsum(offsets)
+
+    assert all(
+        column.num_chunks == 1 for column in examples.columns
+    )  # `pc.take` returns a ChunkedArray with a single chunk
+
+    lengths = examples["seq_lengths"].chunks[0]
+    examples = examples.drop_columns("seq_lengths")
+    lengths = pa.ListArray.from_arrays(
+        np.cumsum([0] + [len(bin["ids"]) for bin in bins], dtype=np.int32), lengths
+    )
+
+    columns = []
+    for column in examples.columns:
+        column = column.chunks[0]
+        if pa.types.is_list(column.type) or pa.types.is_large_list(column.type):
+            dtype = column.offsets.type.to_pandas_dtype()
+            column = type(column).from_arrays(offsets.astype(dtype), column.values)
+        columns.append(column)
+    return pa.Table.from_arrays(
+        columns + [lengths], names=examples.column_names + ["seq_lengths"]
+    )
+
+
+def _pack_wrapped(examples: pa.Table, seq_length: int) -> pa.Table:
+    """Pack sequences in a pyarrow Table using a wrapped strategy."""
+    columns = []
+    for column in examples.columns:
+        if pyarrow.types.is_list(column.type) or pyarrow.types.is_large_list(
+            column.type
+        ):
+            if isinstance(column, pa.ChunkedArray):
+                column = column.combine_chunks()
+            offsets, values = column.offsets, column.values
+            values = values[offsets[0].as_py() : offsets[-1].as_py()]
+            num_elements = len(values)
+            dtype = offsets.type.to_pandas_dtype()  # np.int32 or np.int64
+            offsets = np.arange(0, num_elements, seq_length, dtype=dtype)
+            offsets = np.concatenate((offsets, [num_elements]))
+            column = type(column).from_arrays(offsets, values)
+        columns.append(column)
+    return pa.Table.from_arrays(columns, names=examples.column_names)
+
+
+def pack_dataset(
+    dataset: DatasetType,
+    seq_length: int,
+    strategy: str = "bfd",
+    map_kwargs: dict[str, Any] | None = None,
+) -> DatasetType:
+    r"""
+    Pack sequences in a dataset into chunks of size `seq_length`.
+
+    Args:
+        dataset ([`~datasets.Dataset`] or [`~datasets.DatasetDict`]):
+            Dataset to pack
+        seq_length (`int`):
+            Target sequence length to pack to.
+        strategy (`str`, *optional*, defaults to `"bfd"`):
+            Packing strategy to use. Can be either:
+
+            - `"bfd"` (Best Fit Decreasing): Slower but preserves sequence boundaries. Sequences are never cut in the
+                middle.
+            - `"wrapped"`: Faster but more aggressive. Ignores sequence boundaries and will cut sequences in the middle
+                to completely fill each packed sequence with data.
+        map_kwargs (`dict`, *optional*):
+            Additional keyword arguments to pass to the dataset's map method when packing examples.
+
+    Returns
+    -------
+        [`~datasets.Dataset`] or [`~datasets.DatasetDict`]: The dataset with packed sequences. The number of examples
+        may decrease as sequences are combined.
+
+    Example:
+    ```python
+    >>> from datasets import Dataset
+    >>> from trl import pack_dataset
+
+    >>> examples = {
+    ...     "input_ids": [[1, 2, 3], [4, 5], [6, 7, 8], [9]],
+    ...     "attention_mask": [[1, 1, 0], [1, 0], [1, 0, 0], [1]],
+    ... }
+    >>> dataset = Dataset.from_dict(examples)
+    >>> packed_dataset = pack_dataset(dataset, seq_length=4, strategy="bfd")
+    >>> packed_dataset[:]
+    {'input_ids': [[1, 2, 3, 9], [6, 7, 8], [4, 5]],
+    'attention_mask': [[1, 1, 0, 1], [1, 0, 0], [1, 0]],
+    'seq_lengths': [[3, 1], [3], [2]]}
+    ```
+    """
+    if map_kwargs is None:
+        map_kwargs = {}
+    # Fast packing with pyarrow
+    dataset = dataset.with_format("arrow")
+    if strategy == "bfd":
+        dataset = dataset.map(
+            _pack_bfd, batched=True, fn_kwargs={"seq_length": seq_length}, **map_kwargs
+        )
+    elif strategy == "wrapped":
+        dataset = dataset.map(
+            _pack_wrapped,
+            batched=True,
+            fn_kwargs={"seq_length": seq_length},
+            **map_kwargs,
+        )
+    else:
+        raise ValueError(
+            f"Invalid packing strategy: {strategy}. Use 'bfd' or 'wrapped'."
+        )
+    dataset = dataset.with_format(None)
+    return dataset
+
+
+def truncate_dataset(
+    dataset: DatasetType, max_length: int, map_kwargs: dict[str, Any] | None = None
+) -> DatasetType:
+    r"""
+    Truncate sequences in a dataset to a specified `max_length`.
+
+    Args:
+        dataset ([`~datasets.Dataset`] or [`~datasets.DatasetDict`]):
+            Dataset to truncate.
+        max_length (`int`):
+            Maximum sequence length to truncate to.
+        map_kwargs (`dict`, *optional*):
+            Additional keyword arguments to pass to the dataset's map method when truncating examples.
+
+    Returns
+    -------
+        [`~datasets.Dataset`] or [`~datasets.DatasetDict`]: The dataset with truncated sequences.
+
+    Example:
+    ```python
+    >>> from datasets import Dataset
+
+    >>> examples = {
+    ...     "input_ids": [[1, 2, 3], [4, 5, 6, 7], [8]],
+    ...     "attention_mask": [[0, 1, 1], [0, 0, 1, 1], [1]],
+    ... }
+    >>> dataset = Dataset.from_dict(examples)
+    >>> truncated_dataset = truncate_dataset(dataset, max_length=2)
+    >>> truncated_dataset[:]
+    {'input_ids': [[1, 2], [4, 5], [8]],
+     'attention_mask': [[0, 1], [0, 0], [1]]}
+    ```
+    """
+    if map_kwargs is None:
+        map_kwargs = {}
+    if isinstance(dataset, Dataset):
+        # Fast truncation with pyarrow
+        def truncate(examples):
+            truncated_columns = []
+            for column in examples.columns:
+                if pyarrow.types.is_list(column.type) or pyarrow.types.is_large_list(
+                    column.type
+                ):
+                    column = pc.list_slice(column, 0, max_length)
+                truncated_columns.append(column)
+            return pa.Table.from_arrays(truncated_columns, names=examples.column_names)
+
+        dataset = dataset.with_format("arrow")
+        dataset = dataset.map(truncate, batched=True, **map_kwargs)
+        dataset = dataset.with_format(None)
+    else:
+
+        def truncate(examples):
+            truncated_examples = {}
+            for key, column in examples.items():
+                if column and isinstance(column[0], list):
+                    column = [val[:max_length] for val in column]
+                truncated_examples[key] = column
+            return truncated_examples
+
+        dataset = dataset.map(
+            truncate,
+            batched=True,
+            **map_kwargs,
+        )
+    return dataset
+
+
+def is_conversational_from_value(example: dict[str, Any]) -> bool:
+    r"""
+    Check if the example is in a conversational format (from/value). Note that this format isn't recommended. Prefer
+    the ChatML format (role/content)
+
+    Args:
+        example (`dict[str, Any]`):
+            A single data entry of a dataset. The example can have different keys depending on the dataset type.
+
+    Returns
+    -------
+        `bool`:
+            `True` if the data is in a conversational Chatformat, `False` otherwise.
+
+    Examples
+    --------
+    ```python
+    >>> example = {"conversations": [{"from": "user", "value": "What color is the sky?"}]}
+    >>> is_conversational_from_value(example)
+    True
+
+    >>> example = {"conversations": [{"role": "user", "content": "What color is the sky?"}]}
+    >>> is_conversational_from_value(example)
+    False
+
+    >>> example = {"conversations": "The sky is"}
+    >>> is_conversational_from_value(example)
+    False
+    ```
+    """
+    maybe_messages = example.get("conversations")
+    # It must be a list of messages
+    if isinstance(maybe_messages, list):
+        maybe_message = maybe_messages[0]
+        # Each message must a list of dictionaries with keys "from" and "value"
+        if (
+            isinstance(maybe_message, dict)
+            and "from" in maybe_message
+            and "value" in maybe_message
+        ):
+            return True
+
+    return False
+
+
+def maybe_convert_to_chatml(example: dict[str, list]) -> dict[str, list]:
+    """
+    Convert a conversational dataset with fields `from` and `value` to ChatML format.
+
+    This function modifies conversational data to align with OpenAI's ChatML format:
+    - Replaces the key `"from"` with `"role"` in message dictionaries.
+    - Replaces the key `"value"` with `"content"` in message dictionaries.
+    - Renames `"conversations"` to `"messages"` for consistency with ChatML.
+
+    Args:
+        example (`dict[str, list]`):
+            A single data entry containing a list of messages.
+
+    Returns
+    -------
+        `dict[str, list]`:
+            Example reformatted to ChatML style.
+
+    Example:
+    ```python
+    >>> from trl import maybe_convert_to_chatml
+
+    >>> example = {
+    ...     "conversations": [
+    ...         {"from": "user", "value": "What color is the sky?"},
+    ...         {"from": "assistant", "value": "It is blue."},
+    ...     ]
+    ... }
+    >>> maybe_convert_to_chatml(example)
+    {'messages': [{'role': 'user', 'content': 'What color is the sky?'},
+                  {'role': 'assistant', 'content': 'It is blue.'}]}
+    ```
+    """
+    # List of possible keys containing message lists
+    for key in [
+        "prompt",
+        "completion",
+        "chosen",
+        "rejected",
+        "messages",
+        "conversations",
+    ]:
+        if key in example and isinstance(example[key], list):
+            messages = example[key]
+            for message in messages:
+                if isinstance(message, dict):
+                    if "from" in message:
+                        message["role"] = message.pop("from")
+                    if "value" in message:
+                        message["content"] = message.pop("value")
+
+    # Rename "conversations" to "messages"
+    if "conversations" in example:
+        example["messages"] = example.pop("conversations")
+
+    return example
diff --git a/src/aixpert/training/training/trl/experimental/__init__.py b/src/aixpert/training/training/trl/experimental/__init__.py
new file mode 100644
index 0000000..28f8f42
--- /dev/null
+++ b/src/aixpert/training/training/trl/experimental/__init__.py
@@ -0,0 +1,34 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Experimental submodule for TRL.
+
+This submodule contains unstable or incubating features. Anything here may change (or be removed) in any release
+without deprecation. Use at your own risk.
+
+To silence this notice set environment variable TRL_EXPERIMENTAL_SILENCE=1.
+"""
+
+import os
+import warnings
+
+
+if not os.environ.get("TRL_EXPERIMENTAL_SILENCE"):
+    warnings.warn(
+        "You are importing from 'trl.experimental'. APIs here are unstable and may change or be removed without "
+        "notice. Silence this warning by setting environment variable TRL_EXPERIMENTAL_SILENCE=1.",
+        UserWarning,
+        stacklevel=2,
+    )
diff --git a/src/aixpert/training/training/trl/experimental/bco/__init__.py b/src/aixpert/training/training/trl/experimental/bco/__init__.py
new file mode 100644
index 0000000..9f57889
--- /dev/null
+++ b/src/aixpert/training/training/trl/experimental/bco/__init__.py
@@ -0,0 +1,16 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .bco_config import BCOConfig
+from .bco_trainer import BCOTrainer
diff --git a/src/aixpert/training/training/trl/experimental/bco/bco_config.py b/src/aixpert/training/training/trl/experimental/bco/bco_config.py
new file mode 100644
index 0000000..523e5d6
--- /dev/null
+++ b/src/aixpert/training/training/trl/experimental/bco/bco_config.py
@@ -0,0 +1,226 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass, field
+from typing import Any
+
+from transformers import TrainingArguments
+
+
+@dataclass
+class BCOConfig(TrainingArguments):
+    r"""
+    Configuration class for the [`BCOTrainer`].
+
+    This class includes only the parameters that are specific to BCO training. For a full list of training arguments,
+    please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this class may
+    differ from those in [`~transformers.TrainingArguments`].
+
+    Using [`~transformers.HfArgumentParser`] we can turn this class into
+    [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
+    command line.
+
+    Parameters
+    ----------
+        max_length (`int` or `None`, *optional*, defaults to `1024`):
+            Maximum length of the sequences (prompt + completion) in the batch. This argument is required if you want
+            to use the default data collator.
+        max_prompt_length (`int` or `None`, *optional*, defaults to `512`):
+            Maximum length of the prompt. This argument is required if you want to use the default data collator.
+        max_completion_length (`int`, *optional*):
+            Maximum length of the completion. This argument is required if you want to use the default data collator
+            and your model is an encoder-decoder.
+        beta (`float`, *optional*, defaults to `0.1`):
+            Parameter controlling the deviation from the reference model. Higher β means less deviation from the
+            reference model.
+        label_pad_token_id (`int`,  *optional*, defaults to `-100`):
+            Label pad token id. This argument is required if you want to use the default data collator.
+        padding_value (`int`, *optional*):
+            Padding value to use. If `None`, the padding value of the tokenizer is used.
+        truncation_mode (`str`, *optional*, defaults to `"keep_end"`):
+            Truncation mode to use when the prompt is too long. Possible values are `"keep_end"` or `"keep_start"`.
+            This argument is required if you want to use the default data collator.
+        disable_dropout (`bool`, *optional*, defaults to `True`):
+            Whether to disable dropout in the model and reference model.
+        generate_during_eval (`bool`, *optional*, defaults to `False`):
+            If `True`, generates and logs completions from both the model and the reference model to W&B or Comet
+            during evaluation.
+        is_encoder_decoder (`bool`, *optional*):
+            When using the `model_init` argument (callable) to instantiate the model instead of the `model` argument,
+            you need to specify if the model returned by the callable is an encoder-decoder model.
+        precompute_ref_log_probs (`bool`, *optional*, defaults to `False`):
+            Whether to precompute reference model log probabilities for training and evaluation datasets. This is
+            useful when training without the reference model to reduce the total GPU memory needed.
+        model_init_kwargs (`dict[str, Any]`, *optional*):
+            Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a
+            string.
+        ref_model_init_kwargs (`dict[str, Any]`, *optional*):
+            Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the reference model
+            from a string.
+        dataset_num_proc (`int`, *optional*):
+            Number of processes to use for processing the dataset.
+        prompt_sample_size (`int`, *optional*, defaults to `1024`):
+            Number of prompts that are fed to density ratio classifier.
+        min_density_ratio (`float`, *optional*, defaults to `0.5`):
+            Minimum value of the density ratio. The estimated density ratio is clamped to this value.
+        max_density_ratio (`float`, *optional*, defaults to `10.0`):
+            Maximum value of the density ratio. The estimated density ratio is clamped to this value.
+    """
+
+    _VALID_DICT_FIELDS = TrainingArguments._VALID_DICT_FIELDS + [
+        "model_init_kwargs",
+        "ref_model_init_kwargs",
+    ]
+
+    # Parameters whose default values are overridden from TrainingArguments
+    logging_steps: float = field(
+        default=10,
+        metadata={
+            "help": "Log every X updates steps. Should be an integer or a float in range `[0,1)`. If smaller than 1, "
+            "will be interpreted as ratio of total training steps."
+        },
+    )
+    gradient_checkpointing: bool = field(
+        default=True,
+        metadata={
+            "help": "If True, use gradient checkpointing to save memory at the expense of slower backward pass."
+        },
+    )
+    bf16: bool | None = field(
+        default=None,
+        metadata={
+            "help": "Whether to use bf16 (mixed) precision instead of 32-bit. Requires Ampere or higher NVIDIA "
+            "architecture or Intel XPU or using CPU (use_cpu) or Ascend NPU. If not set, it defaults to `True` if "
+            "`fp16` is not set."
+        },
+    )
+
+    max_length: int | None = field(
+        default=1024,
+        metadata={
+            "help": "Maximum length of the sequences (prompt + completion) in the batch. "
+            "This argument is required if you want to use the default data collator."
+        },
+    )
+    max_prompt_length: int | None = field(
+        default=512,
+        metadata={
+            "help": "Maximum length of the prompt. "
+            "This argument is required if you want to use the default data collator."
+        },
+    )
+    max_completion_length: int | None = field(
+        default=None,
+        metadata={
+            "help": "Maximum length of the completion. This argument is required if you want to use the "
+            "default data collator and your model is an encoder-decoder."
+        },
+    )
+    beta: float = field(
+        default=0.1,
+        metadata={
+            "help": "Parameter controlling the deviation from the reference model. "
+            "Higher β means less deviation from the reference model."
+        },
+    )
+    label_pad_token_id: int = field(
+        default=-100,
+        metadata={
+            "help": "Label pad token id. This argument is required if you want to use the default data collator."
+        },
+    )
+    padding_value: int | None = field(
+        default=None,
+        metadata={
+            "help": "Padding value to use. If `None`, the padding value of the tokenizer is used."
+        },
+    )
+    truncation_mode: str = field(
+        default="keep_end",
+        metadata={
+            "help": "Truncation mode to use when the prompt is too long. Possible values are "
+            "`keep_end` or `keep_start`. This argument is required if you want to use the "
+            "default data collator."
+        },
+    )
+    disable_dropout: bool = field(
+        default=True,
+        metadata={
+            "help": "Whether to disable dropout in the model and reference model."
+        },
+    )
+    generate_during_eval: bool = field(
+        default=False,
+        metadata={
+            "help": "If `True`, generates and logs completions from both the model and the reference model "
+            "to W&B during evaluation."
+        },
+    )
+    is_encoder_decoder: bool | None = field(
+        default=None,
+        metadata={
+            "help": "When using the `model_init` argument (callable) to instantiate the model instead of the "
+            "`model` argument, you need to specify if the model returned by the callable is an "
+            "encoder-decoder model."
+        },
+    )
+    precompute_ref_log_probs: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to precompute reference model log probabilities for training and evaluation datasets. "
+            "This is useful when training without the reference model to reduce the total GPU memory "
+            "needed."
+        },
+    )
+    model_init_kwargs: dict[str, Any] | None = field(
+        default=None,
+        metadata={
+            "help": "Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the "
+            "model from a string."
+        },
+    )
+    ref_model_init_kwargs: dict[str, Any] | None = field(
+        default=None,
+        metadata={
+            "help": "Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the "
+            "reference model from a string."
+        },
+    )
+    dataset_num_proc: int | None = field(
+        default=None,
+        metadata={"help": "Number of processes to use for processing the dataset."},
+    )
+    prompt_sample_size: int = field(
+        default=1024,
+        metadata={
+            "help": "Number of prompts that are fed to density ratio classifier."
+        },
+    )
+    min_density_ratio: float = field(
+        default=0.5,
+        metadata={
+            "help": "Minimum value of the density ratio. The estimated density ratio is clamped to this value."
+        },
+    )
+    max_density_ratio: float = field(
+        default=10.0,
+        metadata={
+            "help": "Maximum value of the density ratio. The estimated density ratio is clamped to this value."
+        },
+    )
+
+    def __post_init__(self):
+        self.bf16 = not (self.fp16) if self.bf16 is None else self.bf16
+
+        super().__post_init__()
diff --git a/src/aixpert/training/training/trl/experimental/bco/bco_trainer.py b/src/aixpert/training/training/trl/experimental/bco/bco_trainer.py
new file mode 100644
index 0000000..c5c890e
--- /dev/null
+++ b/src/aixpert/training/training/trl/experimental/bco/bco_trainer.py
@@ -0,0 +1,1777 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import os
+import random
+import textwrap
+import warnings
+from collections import defaultdict
+from collections.abc import Callable
+from contextlib import contextmanager, nullcontext
+from operator import itemgetter
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Literal, Optional
+
+import numpy as np
+import pandas as pd
+import torch
+import torch.nn.functional as F
+from accelerate import PartialState, logging
+from accelerate.utils import tqdm
+from datasets import Dataset
+from torch import autocast, nn
+from torch.utils.data import DataLoader, SequentialSampler
+from transformers import (
+    AutoModelForCausalLM,
+    BaseImageProcessor,
+    DataCollator,
+    FeatureExtractionMixin,
+    PreTrainedModel,
+    PreTrainedTokenizerBase,
+    ProcessorMixin,
+    TrainingArguments,
+    is_comet_available,
+    is_sklearn_available,
+    is_wandb_available,
+)
+from transformers.trainer_callback import TrainerCallback
+from transformers.trainer_utils import EvalLoopOutput, has_length
+from transformers.utils import is_peft_available
+
+from ...data_utils import (
+    maybe_apply_chat_template,
+    maybe_extract_prompt,
+    maybe_unpair_preference_dataset,
+)
+from ...import_utils import is_joblib_available
+from ...models import create_reference_model, prepare_deepspeed
+from ...trainer.base_trainer import BaseTrainer
+from ...trainer.utils import (
+    DPODataCollatorWithPadding,
+    RunningMoments,
+    disable_dropout_in_model,
+    log_table_to_comet_experiment,
+    pad_to_length,
+    peft_module_casting_to_bf16,
+    selective_log_softmax,
+)
+from .bco_config import BCOConfig
+
+
+if is_peft_available():
+    from peft import PeftModel, get_peft_model, prepare_model_for_kbit_training
+
+if is_wandb_available():
+    import wandb
+
+if is_sklearn_available():
+    from sklearn.linear_model import LogisticRegression
+
+if is_joblib_available():
+    import joblib
+
+if TYPE_CHECKING:
+    from transformers import PreTrainedTokenizer
+
+logger = logging.get_logger(__name__)
+
+RUNNING_NAME = "running.json"
+CLF_NAME = "clf.pkl"
+
+
+def _tokenize(
+    batch: dict[str, list[Any]],
+    tokenizer: "PreTrainedTokenizer",
+    embedding_tokenizer: Optional["PreTrainedTokenizer"] = None,
+) -> dict[str, list[Any]]:
+    """Tokenize a batch from a BCO specific dataset."""
+    prompt_tokenized = tokenizer(batch["prompt"], add_special_tokens=False)
+    prompt_input_ids = prompt_tokenized["input_ids"]
+    prompt_attention_mask = prompt_tokenized["attention_mask"]
+    prompt_and_completion = [
+        prompt + completion
+        for prompt, completion in zip(batch["prompt"], batch["completion"], strict=True)
+    ]
+    full_tokenized = tokenizer(prompt_and_completion, add_special_tokens=False)
+    full_input_ids = full_tokenized["input_ids"]
+    full_attention_mask = full_tokenized["attention_mask"]
+
+    answer_input_ids = [
+        f[len(p) :] for f, p in zip(full_input_ids, prompt_input_ids, strict=True)
+    ]
+    answer_attention_mask = [
+        f[len(p) :]
+        for f, p in zip(full_attention_mask, prompt_attention_mask, strict=True)
+    ]
+
+    # Concat tokens to form `enc(a) + enc(a + b)[len(enc(a)):]`
+    full_concat_input_ids = [
+        np.concatenate([p, a])
+        for p, a in zip(prompt_input_ids, answer_input_ids, strict=True)
+    ]
+    # Prepare input tokens for token by token comparison
+    full_input_ids = [np.array(f) for f in full_input_ids]
+    for full, concat in zip(full_input_ids, full_concat_input_ids, strict=True):
+        if len(full) != len(concat):
+            raise ValueError(
+                "The elements in 'full_input_ids' and 'full_concat_input_ids' must have the same pairwise length."
+            )
+
+    # On some tokenizers, like Llama-2 tokenizer, there are occasions where tokens
+    # can be merged together when tokenizing prompt+answer. This could result
+    # on the last token from the prompt being different when tokenized on its own
+    # vs when done as prompt+answer.
+    response_token_ids_start_idx = [len(p) for p in prompt_input_ids]
+
+    # If tokenized prompt is different than both prompt+answer, then it means the
+    # last token has changed due to merging.
+    for idx, (p, f, r) in enumerate(
+        zip(prompt_input_ids, full_input_ids, response_token_ids_start_idx, strict=True)
+    ):
+        if not np.array_equal(p, f[:r]):
+            response_token_ids_start_idx[idx] -= 1
+
+    prompt_input_ids = [
+        f[:r] for f, r in zip(full_input_ids, response_token_ids_start_idx, strict=True)
+    ]
+    prompt_attention_mask = [
+        f[:r]
+        for f, r in zip(full_attention_mask, response_token_ids_start_idx, strict=True)
+    ]
+
+    for p, m in zip(prompt_input_ids, prompt_attention_mask, strict=True):
+        if len(p) != len(m):
+            raise ValueError(
+                "Prompt input ids and attention mask should have the same length."
+            )
+
+    answer_input_ids = [
+        f[r:] for f, r in zip(full_input_ids, response_token_ids_start_idx, strict=True)
+    ]
+    answer_attention_mask = [
+        f[r:]
+        for f, r in zip(full_attention_mask, response_token_ids_start_idx, strict=True)
+    ]
+
+    output = dict(
+        prompt_input_ids=prompt_input_ids,
+        prompt_attention_mask=prompt_attention_mask,
+        answer_input_ids=answer_input_ids,
+        answer_attention_mask=answer_attention_mask,
+    )
+
+    if embedding_tokenizer is not None:
+        embedding_tokenized = embedding_tokenizer(
+            batch["prompt"], truncation=True, add_special_tokens=False
+        )
+
+        output.update(
+            {
+                "embedding_input_ids": embedding_tokenized["input_ids"],
+                "embedding_attention_mask": embedding_tokenized["attention_mask"],
+            }
+        )
+
+    return output
+
+
+def _process_tokens(
+    example: dict[str, Any], model: "PreTrainedModel" = None, **kwargs
+) -> dict:
+    """Process tokens of a BCO specific dataset.
+
+    At this stage, we don't convert to PyTorch tensors yet; we just handle the truncation in case the prompt +
+    completion responses is/are too long. First we truncate the prompt; if we're still too long, we truncate the
+    completion.
+
+    We also create the labels for the completion responses, which are of length equal to the sum of the length of the
+    prompt and the completion response, with label_pad_token_id for the prompt tokens.
+    """
+    prompt = example["prompt"]
+    completion = example["completion"]
+
+    batch = {
+        f"{kwargs['prefix']}prompt": prompt,
+        f"{kwargs['prefix']}completion": completion,
+        f"{kwargs['prefix']}label": example["label"],
+    }
+
+    if not kwargs["is_encoder_decoder"]:
+        # Check issues below for more details
+        #  1. https://github.com/huggingface/trl/issues/907
+        #  2. https://github.com/EleutherAI/lm-evaluation-harness/pull/531#issuecomment-1595586257
+        #  3. https://github.com/LianjiaTech/BELLE/issues/337
+
+        if not isinstance(prompt, str):
+            raise ValueError(f"prompt should be an str but got {type(prompt)}")
+
+        if not isinstance(completion, str):
+            raise ValueError(f"completion should be an str but got {type(completion)}")
+
+        # keys of format prompt_* refers to just the prompt and answer_* refers to just the answer
+        all_tokens = {
+            "prompt_input_ids": example["prompt_input_ids"],
+            "prompt_attention_mask": example["prompt_attention_mask"],
+            "answer_input_ids": example["answer_input_ids"],
+            "answer_attention_mask": example["answer_attention_mask"],
+        }
+
+        # calculate max length by checking if BOS/EOS is already there
+        max_length = kwargs["max_length"]
+        bos_token_id = kwargs["tokenizer"].bos_token_id
+        eos_token_id = kwargs["tokenizer"].eos_token_id
+        if bos_token_id != all_tokens["prompt_input_ids"][0]:
+            max_length -= 1
+        if eos_token_id != all_tokens["answer_input_ids"][-1]:
+            max_length -= 1
+
+        # if combined sequence is too long (> max_length - 1 for BOS token - 1 for EOS), truncate the prompt
+        if (
+            len(all_tokens["prompt_input_ids"]) + len(all_tokens["answer_input_ids"])
+            > max_length
+        ):
+            for k in ["prompt_input_ids", "prompt_attention_mask"]:
+                if kwargs["truncation_mode"] == "keep_start":
+                    all_tokens[k] = all_tokens[k][: kwargs["max_prompt_length"]]
+                elif kwargs["truncation_mode"] == "keep_end":
+                    all_tokens[k] = all_tokens[k][-kwargs["max_prompt_length"] :]
+                else:
+                    raise ValueError(
+                        f"Unknown truncation mode: {kwargs['truncation_mode']}"
+                    )
+
+        # if that's still too long, truncate the response
+        if (
+            len(all_tokens["prompt_input_ids"]) + len(all_tokens["answer_input_ids"])
+            > max_length
+        ):
+            for k in ["answer_input_ids", "answer_attention_mask"]:
+                all_tokens[k] = all_tokens[k][
+                    : max_length - kwargs["max_prompt_length"]
+                ]
+
+        # all input_ids and attention mask as is. We then check if we need to add BOS/EOS tokens
+        batch[f"{kwargs['prefix']}prompt_input_ids"] = all_tokens["prompt_input_ids"]
+        batch[f"{kwargs['prefix']}prompt_attention_mask"] = all_tokens[
+            "prompt_attention_mask"
+        ]
+        batch[f"{kwargs['prefix']}completion_input_ids"] = (
+            all_tokens["prompt_input_ids"] + all_tokens["answer_input_ids"]
+        )
+        batch[f"{kwargs['prefix']}completion_attention_mask"] = (
+            all_tokens["prompt_attention_mask"] + all_tokens["answer_attention_mask"]
+        )
+
+        # add BOS, which affects both prompt and the full completion
+        if bos_token_id is not None:
+            if (
+                len(all_tokens["prompt_input_ids"]) == 0
+                or bos_token_id != all_tokens["prompt_input_ids"][0]
+            ):
+                batch[f"{kwargs['prefix']}prompt_input_ids"] = [bos_token_id] + batch[
+                    f"{kwargs['prefix']}prompt_input_ids"
+                ]
+                batch[f"{kwargs['prefix']}prompt_attention_mask"] = [1] + batch[
+                    f"{kwargs['prefix']}prompt_attention_mask"
+                ]
+                batch[f"{kwargs['prefix']}completion_input_ids"] = [
+                    bos_token_id
+                ] + batch[f"{kwargs['prefix']}completion_input_ids"]
+                batch[f"{kwargs['prefix']}completion_attention_mask"] = [1] + batch[
+                    f"{kwargs['prefix']}completion_attention_mask"
+                ]
+        # add EOS, which affects only the full completion
+        if (
+            len(all_tokens["answer_input_ids"]) == 0
+            or eos_token_id != all_tokens["answer_input_ids"][-1]
+        ):
+            batch[f"{kwargs['prefix']}completion_input_ids"] = batch[
+                f"{kwargs['prefix']}completion_input_ids"
+            ] + [eos_token_id]
+            batch[f"{kwargs['prefix']}completion_attention_mask"] = batch[
+                f"{kwargs['prefix']}completion_attention_mask"
+            ] + [1]
+
+        batch[f"{kwargs['prefix']}completion_labels"] = batch[
+            f"{kwargs['prefix']}completion_input_ids"
+        ][:]
+        batch[f"{kwargs['prefix']}completion_labels"][
+            : len(batch[f"{kwargs['prefix']}prompt_input_ids"])
+        ] = [kwargs["label_pad_token_id"]] * len(
+            batch[f"{kwargs['prefix']}prompt_input_ids"]
+        )
+    else:
+        completion_tokens = kwargs["tokenizer"](
+            completion,
+            truncation=True,
+            max_length=kwargs["max_completion_length"],
+            add_special_tokens=True,
+        )
+        prompt_tokens = kwargs["tokenizer"](
+            prompt,
+            truncation=True,
+            max_length=kwargs["max_prompt_length"],
+            add_special_tokens=True,
+        )
+
+        batch[f"{kwargs['prefix']}prompt_input_ids"] = prompt_tokens["input_ids"]
+        batch[f"{kwargs['prefix']}prompt_attention_mask"] = prompt_tokens[
+            "attention_mask"
+        ]
+
+        batch[f"{kwargs['prefix']}completion_labels"] = completion_tokens["input_ids"]
+        batch[f"{kwargs['prefix']}completion_attention_mask"] = completion_tokens[
+            "attention_mask"
+        ]
+        if model is not None and hasattr(
+            model, "prepare_decoder_input_ids_from_labels"
+        ):
+            batch[f"{kwargs['prefix']}completion_decoder_input_ids"] = (
+                model.prepare_decoder_input_ids_from_labels(
+                    labels=torch.tensor(batch["completion_labels"])
+                )
+            )
+
+    return batch
+
+
+class BCOTrainer(BaseTrainer):
+    r"""
+    Initialize BCOTrainer from [BCO](https://huggingface.co/papers/2404.04656) paper.
+
+    Args:
+        model ([`~transformers.PreTrainedModel`]):
+            The model to train, preferably an [`~transformers.AutoModelForSequenceClassification`].
+        ref_model ([`PreTrainedModelWrapper`]):
+            Hugging Face transformer model with a casual language modelling head. Used for implicit reward computation
+            and loss. If no reference model is provided, the trainer will create a reference model with the same
+            architecture as the model to be optimized.
+        args ([`BCOConfig`]):
+            The arguments to use for training.
+        train_dataset ([`~datasets.Dataset`]):
+            The dataset to use for training.
+        eval_dataset ([`~datasets.Dataset`]):
+            The dataset to use for evaluation.
+        processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*):
+            Processing class used to process the data. If provided, will be used to automatically process the inputs
+            for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
+            reuse the fine-tuned model.
+        data_collator ([`~transformers.DataCollator`], *optional*):
+            The data collator to use for training. If None is specified, the default data collator
+            ([`DPODataCollatorWithPadding`]) will be used which will pad the sequences to the maximum length of the
+            sequences in the batch, given a dataset of paired sequences.
+        model_init (`Callable[[], transformers.PreTrainedModel]`):
+            The model initializer to use for training. If None is specified, the default model initializer will be
+            used.
+        callbacks (`list[transformers.TrainerCallback]`):
+            The callbacks to use for training.
+        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
+            The optimizer and scheduler to use for training.
+        preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
+            The function to use to preprocess the logits before computing the metrics.
+        peft_config (`dict`, defaults to `None`):
+            The PEFT configuration to use for training. If you pass a PEFT configuration, the model will be wrapped in
+            a PEFT model.
+        compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
+            The function to use to compute the metrics. Must take a `EvalPrediction` and return a dictionary string to
+            metric values.
+        model_adapter_name (`str`, defaults to `None`):
+            Name of the train target PEFT adapter, when using LoRA with multiple adapters.
+        ref_adapter_name (`str`, defaults to `None`):
+            Name of the reference PEFT adapter, when using LoRA with multiple adapters.
+    """
+
+    _tag_names = ["trl", "bco"]
+    _name = "BCO"
+    _paper = {
+        "title": "Binary Classifier Optimization for Large Language Model Alignment",
+        "id": "2404.04656",
+        # docstyle-ignore
+        "citation": textwrap.dedent("""\
+            @article{jung2024binary,
+                title        = {{Binary Classifier Optimization for Large Language Model Alignment}},
+                author       = {Seungjae Jung and Gunsoo Han and Daniel Wontae Name and Kyoung{-}Woon On},
+                year         = 2024,
+                eprint       = {arXiv:2404.04656}
+            }"""),
+    }
+
+    def __init__(
+        self,
+        model: PreTrainedModel | nn.Module | str = None,
+        ref_model: PreTrainedModel | nn.Module | str | None = None,
+        args: BCOConfig = None,
+        train_dataset: Dataset | None = None,
+        eval_dataset: Dataset | dict[str, Dataset] | None = None,
+        processing_class: PreTrainedTokenizerBase
+        | BaseImageProcessor
+        | FeatureExtractionMixin
+        | ProcessorMixin
+        | None = None,
+        data_collator: DataCollator | None = None,
+        model_init: Callable[[], PreTrainedModel] | None = None,
+        callbacks: list[TrainerCallback] | None = None,
+        optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (
+            None,
+            None,
+        ),
+        preprocess_logits_for_metrics: Callable[
+            [torch.Tensor, torch.Tensor], torch.Tensor
+        ]
+        | None = None,
+        peft_config: dict | None = None,
+        compute_metrics: Callable[[EvalLoopOutput], dict] | None = None,
+        model_adapter_name: str | None = None,
+        ref_adapter_name: str | None = None,
+        embedding_func: Callable | None = None,
+        embedding_tokenizer: PreTrainedTokenizerBase | None = None,
+    ):
+        if not os.environ.get("TRL_EXPERIMENTAL_SILENCE"):
+            warnings.warn(
+                "This trainer will soon be moved to trl.experimental and is a candidate for removal. If you rely on "
+                "it and want it to remain, please share your comments here: "
+                "https://github.com/huggingface/trl/issues/4223. Silence this warning by setting environment variable "
+                "TRL_EXPERIMENTAL_SILENCE=1."
+            )
+        if embedding_func is not None and not (
+            is_sklearn_available() and is_joblib_available()
+        ):
+            raise ImportError(
+                "BCOTrainer with UDM requires the scikit-learn and joblib libraries. Please install it with `pip install scikit-learn joblib`."
+            )
+
+        if type(args) is TrainingArguments:
+            raise ValueError("Please use `BCOConfig` instead `TrainingArguments`.")
+
+        if not isinstance(model, str) and model is not None and ref_model is model:
+            raise ValueError(
+                "`model` and `ref_model` cannot be the same object. If you want `ref_model` to be the "
+                "same as `model`, you must mass a copy of it, or `None` if you use peft."
+            )
+
+        if args.model_init_kwargs is None:
+            model_init_kwargs = {}
+        elif not isinstance(model, str):
+            raise ValueError(
+                "You passed model_kwargs to the BCOTrainer. But your model is already instantiated."
+            )
+        else:
+            model_init_kwargs = args.model_init_kwargs
+            dtype = model_init_kwargs.get("dtype")
+            if dtype is not None:
+                # Convert to `torch.dtype` if an str is passed
+                if isinstance(dtype, str) and dtype != "auto":
+                    dtype = getattr(torch, dtype)
+                if dtype != "auto" and not isinstance(dtype, torch.dtype):
+                    raise ValueError(
+                        f"Invalid `dtype` passed to the BCOConfig. Expected a string with either `torch.dtype` or 'auto', but got {dtype}."
+                    )
+                model_init_kwargs["dtype"] = dtype
+
+        if args.ref_model_init_kwargs is None:
+            ref_model_init_kwargs = {}
+        elif not isinstance(ref_model, str):
+            raise ValueError(
+                "You passed ref_model_kwargs to the BCOTrainer. But your ref_model is already instantiated."
+            )
+        else:
+            ref_model_init_kwargs = args.ref_model_init_kwargs
+            dtype = ref_model_init_kwargs.get("dtype")
+            if dtype is not None:
+                # Convert to `torch.dtype` if an str is passed
+                if isinstance(dtype, str) and dtype != "auto":
+                    dtype = getattr(torch, dtype)
+                if dtype != "auto" and not isinstance(dtype, torch.dtype):
+                    raise ValueError(
+                        f"Invalid `dtype` passed to the BCOConfig. Expected a string with either `torch.dtype` or 'auto', but got {dtype}."
+                    )
+                ref_model_init_kwargs["dtype"] = dtype
+
+        if isinstance(model, str):
+            model = AutoModelForCausalLM.from_pretrained(model, **model_init_kwargs)
+
+        if isinstance(ref_model, str):
+            ref_model = AutoModelForCausalLM.from_pretrained(
+                ref_model, **ref_model_init_kwargs
+            )
+
+        # Initialize this variable to False. This helps tracking the case when `peft_module_casting_to_bf16`
+        # has been called in order to properly call autocast if needed.
+        self._peft_has_been_casted_to_bf16 = False
+
+        if not is_peft_available() and peft_config is not None:
+            raise ValueError(
+                "PEFT is not installed and you passed a `peft_config` in the trainer's kwargs, please install it with `pip install peft` to use the PEFT models"
+            )
+        if is_peft_available() and peft_config is not None:
+            # if model is a peft model and we have a peft_config, we merge and unload it first
+            if isinstance(model, PeftModel):
+                model = model.merge_and_unload()
+
+            if getattr(model, "is_loaded_in_8bit", False) or getattr(
+                model, "is_loaded_in_4bit", False
+            ):
+                _support_gc_kwargs = hasattr(
+                    args, "gradient_checkpointing_kwargs"
+                ) and "gradient_checkpointing_kwargs" in list(
+                    inspect.signature(prepare_model_for_kbit_training).parameters
+                )
+
+                prepare_model_kwargs = {
+                    "use_gradient_checkpointing": args.gradient_checkpointing
+                }
+
+                if _support_gc_kwargs:
+                    prepare_model_kwargs["gradient_checkpointing_kwargs"] = (
+                        args.gradient_checkpointing_kwargs
+                    )
+
+                model = prepare_model_for_kbit_training(model, **prepare_model_kwargs)
+            elif args.gradient_checkpointing:
+                # For backward compatibility with older versions of transformers
+                if hasattr(model, "enable_input_require_grads"):
+                    model.enable_input_require_grads()
+                else:
+
+                    def make_inputs_require_grad(module, input, output):
+                        output.requires_grad_(True)
+
+                    model.get_input_embeddings().register_forward_hook(
+                        make_inputs_require_grad
+                    )
+
+            # get peft model with the given config
+            model = get_peft_model(model, peft_config)
+            if args.bf16 and getattr(model, "is_loaded_in_4bit", False):
+                peft_module_casting_to_bf16(model)
+                # If args.bf16 we need to explicitly call `generate` with torch amp autocast context manager
+                self._peft_has_been_casted_to_bf16 = True
+
+        # For models that use gradient_checkpointing, we need to attach a hook that enables input
+        # to explicitly have `requires_grad=True`, otherwise training will either silently
+        # fail or completely fail.
+        elif args.gradient_checkpointing:
+            # For backward compatibility with older versions of transformers
+            if hasattr(model, "enable_input_require_grads"):
+                model.enable_input_require_grads()
+            else:
+
+                def make_inputs_require_grad(module, input, output):
+                    output.requires_grad_(True)
+
+                model.get_input_embeddings().register_forward_hook(
+                    make_inputs_require_grad
+                )
+
+        if args.generate_during_eval and not (
+            is_wandb_available() or is_comet_available()
+        ):
+            raise ValueError(
+                "`generate_during_eval=True` requires Weights and Biases or Comet to be installed."
+                " Please install `wandb` or `comet-ml` to resolve."
+            )
+
+        if model is not None:
+            self.is_encoder_decoder = model.config.is_encoder_decoder
+        elif args.is_encoder_decoder is None:
+            raise ValueError(
+                "When no model is provided, you need to pass the parameter is_encoder_decoder."
+            )
+        else:
+            self.is_encoder_decoder = args.is_encoder_decoder
+
+        self.is_peft_model = is_peft_available() and isinstance(model, PeftModel)
+        self.model_adapter_name = model_adapter_name
+        self.ref_adapter_name = ref_adapter_name
+
+        if ref_model:
+            self.ref_model = ref_model
+        elif self.is_peft_model or args.precompute_ref_log_probs:
+            # The `model` with adapters turned off will be used as the reference model
+            self.ref_model = None
+        else:
+            self.ref_model = create_reference_model(model)
+
+        if processing_class is None:
+            raise ValueError(
+                "max_length or a processing_class must be specified when using the default DPODataCollatorWithPadding"
+            )
+        if args.max_length is None:
+            logger.warning(
+                "When using DPODataCollatorWithPadding, you should set `max_length` in the `BCOConfig`. "
+                "It will be set to `512` by default, but you should do it yourself in the future.",
+            )
+            max_length = 512
+        if args.max_length is not None:
+            max_length = args.max_length
+
+        if args.max_prompt_length is None:
+            logger.warning(
+                "When using DPODataCollatorWithPadding, you should set `max_prompt_length` in the `BCOConfig`. "
+                "It will be set to `128` by default, but you should do it yourself in the future.",
+            )
+            max_prompt_length = 128
+        if args.max_prompt_length is not None:
+            max_prompt_length = args.max_prompt_length
+
+        max_completion_length = None
+        if args.max_completion_length is None and self.is_encoder_decoder:
+            logger.warning(
+                "When using DPODataCollatorWithPadding with an encoder decoder architecture, you should set `max_completion_length` in the BCOTrainer's init"
+                " it will be set to `128` by default, but you should do it yourself in the future.",
+            )
+            max_completion_length = 128
+        if args.max_completion_length is not None and self.is_encoder_decoder:
+            max_completion_length = args.max_completion_length
+
+        if data_collator is None:
+            data_collator = DPODataCollatorWithPadding(
+                pad_token_id=processing_class.pad_token_id,
+                label_pad_token_id=args.label_pad_token_id,
+                is_encoder_decoder=self.is_encoder_decoder,
+            )
+
+            if args.remove_unused_columns:
+                args.remove_unused_columns = False
+                # warn users
+                logger.warning(
+                    "When using DPODataCollatorWithPadding, you should set `remove_unused_columns=False` in your BCOConfig"
+                    " we have set it for you, but you should do it yourself in the future.",
+                )
+
+            self.use_dpo_data_collator = True
+        else:
+            self.use_dpo_data_collator = False
+
+        # Disable dropout in the model and reference model
+        if args.disable_dropout:
+            disable_dropout_in_model(model)
+            if self.ref_model is not None:
+                disable_dropout_in_model(self.ref_model)
+
+        self.max_length = max_length
+        self.generate_during_eval = args.generate_during_eval
+        self.label_pad_token_id = args.label_pad_token_id
+        self.padding_value = (
+            args.padding_value
+            if args.padding_value is not None
+            else processing_class.pad_token_id
+        )
+        self.max_prompt_length = max_prompt_length
+        self.truncation_mode = args.truncation_mode
+        self.max_completion_length = max_completion_length
+        self.precompute_ref_log_probs = args.precompute_ref_log_probs
+
+        # Since ref_logs are precomputed on the first call to get_train/eval_dataloader
+        # keep track of first called to avoid computation of future calls
+        self._precomputed_train_ref_log_probs = False
+        self._precomputed_eval_ref_log_probs = False
+
+        # metric
+        self._stored_metrics = defaultdict(lambda: defaultdict(list))
+
+        # BCO parameter
+        self.beta = args.beta
+        self.aux_loss_enabled = getattr(model.config, "output_router_logits", False)
+        self.aux_loss_coef = getattr(model.config, "router_aux_loss_coef", 0.0)
+        if self.aux_loss_enabled and self.aux_loss_coef == 0.0:
+            logger.warning(
+                "You set `output_router_logits` to `True` in the model config, but `router_aux_loss_coef` is set to "
+                "`0.0`, meaning the auxiliary loss will not be used. Either set `router_aux_loss_coef` to a value "
+                "greater than `0.0`, or set `output_router_logits` to `False` if you don't want to use the auxiliary "
+                "loss.",
+            )
+
+        # Underlying Distribution Matching argument
+        self.embedding_func = embedding_func
+        self.embedding_tokenizer = embedding_tokenizer
+
+        # The trainer estimates the number of FLOPs (floating-point operations) using the number of elements in the
+        # input tensor associated with the key "input_ids". However, in BCO, the sampled data does not include the
+        # "input_ids" key. Instead, the available keys are "prompt_input_ids" and "completion_input_ids". As a result,
+        # the trainer issues the warning: "Could not estimate the number of tokens of the input, floating-point
+        # operations will not be computed." To suppress this warning, we set the "estimate_tokens" key in the model's
+        # "warnings_issued" dictionary to True. This acts as a flag to indicate that the warning has already been
+        # issued.
+        model.warnings_issued["estimate_tokens"] = True
+
+        with PartialState().main_process_first():
+            # Extract the prompt if needed
+            train_dataset = train_dataset.map(
+                maybe_extract_prompt,
+                num_proc=args.dataset_num_proc,
+                desc="Extracting prompt from train dataset",
+            )
+            # Unpair the dataset if needed
+            train_dataset = maybe_unpair_preference_dataset(
+                train_dataset, args.dataset_num_proc, desc="Unpairing train dataset"
+            )
+            # Apply the chat template if needed
+            train_dataset = train_dataset.map(
+                maybe_apply_chat_template,
+                fn_kwargs={"tokenizer": processing_class},
+                num_proc=args.dataset_num_proc,
+            )
+            if eval_dataset is not None:
+                # Extract the prompt if needed
+                eval_dataset = eval_dataset.map(
+                    maybe_extract_prompt,
+                    num_proc=args.dataset_num_proc,
+                    desc="Extracting prompt from eval dataset",
+                )
+                # Unpair the dataset if needed
+                eval_dataset = maybe_unpair_preference_dataset(
+                    eval_dataset, args.dataset_num_proc, desc="Unpairing eval dataset"
+                )
+                eval_dataset = eval_dataset.map(
+                    maybe_apply_chat_template,
+                    fn_kwargs={"tokenizer": processing_class},
+                    num_proc=args.dataset_num_proc,
+                )
+
+            # Tokenize and prepare the training datasets
+            train_dataset = train_dataset.map(
+                _tokenize,
+                batched=True,
+                fn_kwargs={
+                    "tokenizer": processing_class,
+                    "embedding_tokenizer": self.embedding_tokenizer,
+                },
+                num_proc=args.dataset_num_proc,
+                desc="Tokenizing train dataset",
+            )
+
+            # Prepare the datasets
+            fn_kwargs = {
+                "prefix": "",
+                "is_encoder_decoder": self.is_encoder_decoder,
+                "tokenizer": processing_class,
+                "max_length": self.max_length,
+                "truncation_mode": self.truncation_mode,
+                "label_pad_token_id": self.label_pad_token_id,
+                "max_prompt_length": self.max_prompt_length,
+                "max_completion_length": self.max_completion_length,
+            }
+            train_dataset = train_dataset.map(
+                _process_tokens,
+                fn_kwargs=fn_kwargs,
+                num_proc=args.dataset_num_proc,
+                desc="Processing tokenized train dataset",
+            )
+
+            if eval_dataset is not None:
+                # Tokenize
+                eval_dataset = eval_dataset.map(
+                    _tokenize,
+                    fn_kwargs={
+                        "tokenizer": processing_class,
+                        "embedding_tokenizer": self.embedding_tokenizer,
+                    },
+                    batched=True,
+                    num_proc=args.dataset_num_proc,
+                    desc="Tokenizing eval dataset",
+                )
+
+                # Process
+                fn_kwargs = {
+                    "prefix": "",
+                    "is_encoder_decoder": self.is_encoder_decoder,
+                    "tokenizer": processing_class,
+                    "max_length": self.max_length,
+                    "truncation_mode": self.truncation_mode,
+                    "label_pad_token_id": self.label_pad_token_id,
+                    "max_prompt_length": self.max_prompt_length,
+                    "max_completion_length": self.max_completion_length,
+                }
+                eval_dataset = eval_dataset.map(
+                    _process_tokens,
+                    fn_kwargs=fn_kwargs,
+                    num_proc=args.dataset_num_proc,
+                    desc="Processing tokenized eval dataset",
+                )
+
+            desirable = train_dataset.filter(
+                lambda x: x["label"],
+                num_proc=args.dataset_num_proc,
+                desc="Filtering desirable examples",
+            )
+            undesirable = train_dataset.filter(
+                lambda x: not x["label"],
+                num_proc=args.dataset_num_proc,
+                desc="Filtering undesirable examples",
+            )
+
+        super().__init__(
+            model=model,
+            args=args,
+            data_collator=data_collator,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            processing_class=processing_class,
+            model_init=model_init,
+            compute_metrics=compute_metrics,
+            callbacks=callbacks,
+            optimizers=optimizers,
+            preprocess_logits_for_metrics=preprocess_logits_for_metrics,
+        )
+
+        # Gradient accumulation requires scaled loss. Normally, loss scaling in the parent class depends on whether the
+        # model accepts loss-related kwargs. Since we compute our own loss, this check is irrelevant. We set
+        # self.model_accepts_loss_kwargs to False to enable scaling.
+        self.model_accepts_loss_kwargs = False
+
+        # Add tags for models that have been loaded with the correct transformers version
+        if hasattr(self.model, "add_model_tags"):
+            self.model.add_model_tags(self._tag_names)
+
+        if not hasattr(self, "accelerator"):
+            raise AttributeError(
+                "Your `Trainer` does not have an `accelerator` object. Consider upgrading `transformers`."
+            )
+
+        # Deepspeed Zero-3 does not support precompute_ref_log_probs
+        if self.is_deepspeed_enabled:
+            if (
+                self.accelerator.state.deepspeed_plugin.zero_stage == 3
+                and self.precompute_ref_log_probs
+            ):
+                raise ValueError(
+                    "You cannot use `precompute_ref_log_probs=True` with Deepspeed ZeRO-3. Please set `precompute_ref_log_probs=False`."
+                )
+
+        if self.ref_model is None:
+            if not (self.is_peft_model or self.precompute_ref_log_probs):
+                raise ValueError(
+                    "No reference model and model is not a Peft model. Try setting `precompute_ref_log_probs=True`"
+                )
+        elif self.is_deepspeed_enabled:
+            self.ref_model = prepare_deepspeed(self.ref_model, self.accelerator)
+        else:
+            self.ref_model = self.accelerator.prepare_model(
+                self.ref_model, evaluation_mode=True
+            )
+
+        self.running = RunningMoments(accelerator=self.accelerator)
+
+        if self.embedding_func is None or args.resume_from_checkpoint:
+            return
+
+        chosen_embeddings = self._get_sample_prompt_embeddings(
+            desirable, sample_size=self.args.prompt_sample_size
+        )
+        rejected_embeddings = self._get_sample_prompt_embeddings(
+            undesirable, sample_size=self.args.prompt_sample_size
+        )
+
+        embeddings = torch.cat((chosen_embeddings, rejected_embeddings), dim=0)
+        labels = torch.cat(
+            (
+                torch.ones_like(chosen_embeddings[:, 0]),
+                torch.zeros_like(rejected_embeddings[:, 0]),
+            ),
+            dim=0,
+        )
+
+        self.clf = LogisticRegression(class_weight="balanced").fit(
+            embeddings.cpu().float().numpy(), labels.cpu().numpy()
+        )
+        chosen_mean = self.clf.score(
+            chosen_embeddings.cpu().float().numpy(),
+            torch.ones_like(chosen_embeddings[:, 0]).cpu().numpy(),
+        )
+        rejected_mean = self.clf.score(
+            rejected_embeddings.cpu().float().numpy(),
+            torch.zeros_like(rejected_embeddings[:, 0]).cpu().numpy(),
+        )
+        logger.info(
+            f"UDM classifier training scores: chosen: {chosen_mean}, rejected: {rejected_mean}"
+        )
+
+    @property
+    def match_underlying_distribution(self):
+        return self.embedding_func is not None and self.embedding_tokenizer is not None
+
+    def _get_chosen_prob(
+        self, prompt_embeddings: torch.FloatTensor
+    ) -> torch.FloatTensor:
+        """
+        Calculates the probability if the given prompt embedding is from desirable dataset. This function calculates
+        the probability in the process and ensemble across processes.
+        """
+        dtype = prompt_embeddings.dtype
+        device = prompt_embeddings.device
+        rank = self.accelerator.process_index
+
+        padded_prompt_embeddings = self.accelerator.pad_across_processes(
+            prompt_embeddings, pad_index=self.embedding_tokenizer.pad_token_id
+        )
+        sample_size = padded_prompt_embeddings.shape[0]
+        nonzero = (
+            padded_prompt_embeddings.mean(dim=1)
+            != self.embedding_tokenizer.pad_token_id
+        )
+        prompt_embeddings = self.accelerator.gather(padded_prompt_embeddings)
+
+        # cannot predict for all empty values
+        if prompt_embeddings.shape[0] == 0:
+            return torch.tensor([], device=device, dtype=dtype)
+
+        prob = self.clf.predict_proba(prompt_embeddings.cpu().float().numpy())[:, 1]
+        prob = torch.as_tensor(prob, dtype=dtype, device=device)
+        prob = self.accelerator.reduce(prob, reduction="mean")
+
+        prob = prob[sample_size * rank : sample_size * (rank + 1)]
+        prob = prob[nonzero]
+
+        return prob
+
+    def _vectorize_prompt(
+        self, input_ids: torch.LongTensor, attention_mask: torch.LongTensor
+    ) -> torch.FloatTensor:
+        """
+        Replaces processing_class.pad_token_id to embedding_tokenizer.pad_token_id and applies self.embedding_func
+        """
+        input_ids = torch.where(
+            input_ids == self.processing_class.pad_token_id,
+            self.embedding_tokenizer.pad_token_id,
+            input_ids,
+        )
+
+        with torch.no_grad():
+            embeddings = self.embedding_func(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+            )
+
+        return embeddings
+
+    def _get_prompt_embeddings(
+        self, batch: dict[str, list | torch.LongTensor]
+    ) -> tuple[torch.FloatTensor, torch.FloatTensor]:
+        """Extract embeddings from frozen embedding model"""
+        if not self.match_underlying_distribution:
+            return None, None
+
+        embeddings = self._vectorize_prompt(
+            input_ids=batch["embedding_input_ids"],
+            attention_mask=batch["embedding_attention_mask"],
+        )
+
+        labels = torch.tensor(
+            batch["label"], dtype=torch.bool, device=embeddings.device
+        )
+        chosen_idx = torch.where(labels)[0]
+        rejected_idx = torch.where(~labels)[0]
+
+        chosen_embeddings = embeddings[chosen_idx, ...]
+        rejected_embeddings = embeddings[rejected_idx, ...]
+
+        return (chosen_embeddings, rejected_embeddings)
+
+    def _get_sample_prompt_embeddings(
+        self, dataset: Dataset, sample_size: int = 512
+    ) -> torch.FloatTensor:
+        """
+        Sample instances from dataset and get prompt embeddings. Used for density ratio classifier training.
+        """
+        n_samples = min(len(dataset), sample_size)
+        rand_indices = np.random.choice(len(dataset), size=(n_samples,))
+
+        embedding_dataset = dataset.select(rand_indices)
+
+        dataloader_params = {
+            "batch_size": self.args.per_device_train_batch_size,
+            "collate_fn": self.data_collator,
+            "num_workers": self.args.dataloader_num_workers,
+            "pin_memory": self.args.dataloader_pin_memory,
+            "shuffle": False,
+        }
+
+        # prepare dataloader
+        data_loader = self.accelerator.prepare(
+            DataLoader(embedding_dataset, **dataloader_params)
+        )
+
+        with torch.no_grad():
+            all_embeddings = torch.empty(0)
+            for padded_batch in tqdm(
+                iterable=data_loader, desc="Building sample prompt embeddings"
+            ):
+                embeddings = self._vectorize_prompt(
+                    input_ids=padded_batch["embedding_input_ids"],
+                    attention_mask=padded_batch["embedding_attention_mask"],
+                )
+                embeddings = self.accelerator.gather_for_metrics(embeddings)
+                all_embeddings = torch.cat((all_embeddings, embeddings.cpu()))
+
+        return all_embeddings
+
+    def _save_optimizer_and_scheduler(self, output_dir):
+        output_dir = output_dir if output_dir is not None else self.args.output_dir
+        super()._save_optimizer_and_scheduler(output_dir)
+
+        if self.accelerator.is_main_process:
+            # When saving optimizer and scheduler to checkpoint, save also the running delta object.
+            self.running.save_to_json(os.path.join(output_dir, RUNNING_NAME))
+
+            if self.match_underlying_distribution:
+                joblib.dump(self.clf, os.path.join(output_dir, CLF_NAME), compress=True)
+
+    def _load_optimizer_and_scheduler(self, checkpoint):
+        if checkpoint is None:
+            logger.warning_once(f"Missing Checkpoint {checkpoint}")
+            return
+
+        super()._load_optimizer_and_scheduler(checkpoint)
+
+        # when loading optimizer and scheduler from checkpoint, also load the running delta object.
+        running_file = os.path.join(checkpoint, RUNNING_NAME)
+        if os.path.isfile(running_file):
+            self.running = RunningMoments.load_from_json(self.accelerator, running_file)
+
+        if self.match_underlying_distribution:
+            clf_file = os.path.join(checkpoint, CLF_NAME)
+            if os.path.isfile(clf_file):
+                self.clf = joblib.load(clf_file)
+
+    @contextmanager
+    def null_ref_context(self):
+        """Context manager for handling null reference model (that is, peft adapter manipulation)."""
+        with (
+            self.accelerator.unwrap_model(self.model).disable_adapter()
+            if self.is_peft_model and not self.ref_adapter_name
+            else nullcontext()
+        ):
+            if self.ref_adapter_name:
+                self.model.set_adapter(self.ref_adapter_name)
+            yield
+            if self.ref_adapter_name:
+                self.model.set_adapter(self.model_adapter_name or "default")
+
+    def get_train_dataloader(self) -> DataLoader:
+        """
+        Returns the training [`~torch.utils.data.DataLoader`].
+
+        Subclass of transformers.src.transformers.trainer.get_train_dataloader to precompute `ref_log_probs`.
+        """
+        if self.precompute_ref_log_probs and not self._precomputed_train_ref_log_probs:
+            dataloader_params = {
+                "batch_size": self.args.per_device_train_batch_size,
+                "collate_fn": self.data_collator,
+                "num_workers": self.args.dataloader_num_workers,
+                "pin_memory": self.args.dataloader_pin_memory,
+                "shuffle": False,
+            }
+
+            # prepare dataloader
+            data_loader = self.accelerator.prepare(
+                DataLoader(self.train_dataset, **dataloader_params)
+            )
+            reference_completion_logps = []
+
+            for padded_batch in tqdm(
+                iterable=data_loader, desc="Train dataset reference log probs"
+            ):
+                reference_completion_logp = self.compute_reference_log_probs(
+                    padded_batch
+                )
+
+                reference_completion_logp = self.accelerator.gather_for_metrics(
+                    reference_completion_logp
+                )
+                reference_completion_logps.append(reference_completion_logp.cpu())
+
+            self.train_dataset = self.train_dataset.add_column(
+                name="reference_logps",
+                column=torch.cat(reference_completion_logps).float().numpy(),
+            )
+
+            self._precomputed_train_ref_log_probs = True
+
+        return super().get_train_dataloader()
+
+    def get_eval_dataloader(self, eval_dataset: Dataset | None = None) -> DataLoader:
+        """
+        Returns the evaluation [`~torch.utils.data.DataLoader`].
+
+        Subclass of transformers.src.transformers.trainer.get_eval_dataloader to precompute `ref_log_probs`.
+
+        Args:
+            eval_dataset (`torch.utils.data.Dataset`, *optional*):
+                If provided, will override `self.eval_dataset`. If it is a [`~datasets.Dataset`], columns not accepted
+                by the `model.forward()` method are automatically removed. It must implement `__len__`.
+        """
+        if eval_dataset is None and self.eval_dataset is None:
+            raise ValueError("Trainer: evaluation requires an eval_dataset.")
+        eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset
+
+        if self.precompute_ref_log_probs and not self._precomputed_eval_ref_log_probs:
+            dataloader_params = {
+                "batch_size": self.args.per_device_eval_batch_size,
+                "collate_fn": self.data_collator,
+                "num_workers": self.args.dataloader_num_workers,
+                "pin_memory": self.args.dataloader_pin_memory,
+                "shuffle": False,
+            }
+
+            # prepare dataloader
+            data_loader = self.accelerator.prepare(
+                DataLoader(eval_dataset, **dataloader_params)
+            )
+
+            reference_completion_logps = []
+
+            for padded_batch in tqdm(
+                iterable=data_loader, desc="Eval dataset reference log probs"
+            ):
+                reference_completion_logp = self.compute_reference_log_probs(
+                    padded_batch
+                )
+
+                reference_completion_logp = self.accelerator.gather_for_metrics(
+                    reference_completion_logp
+                )
+                reference_completion_logps.append(reference_completion_logp.cpu())
+
+            eval_dataset = eval_dataset.add_column(
+                name="reference_logps",
+                column=torch.cat(reference_completion_logps).float().numpy(),
+            )
+
+            # Save calculated reference_chosen_logps and reference_rejected_logps to the eval_dataset for subsequent runs
+            if self.eval_dataset is not None:
+                self.eval_dataset = eval_dataset
+            self._precomputed_eval_ref_log_probs = True
+
+        return super().get_eval_dataloader(eval_dataset=eval_dataset)
+
+    def compute_reference_log_probs(self, padded_batch: dict) -> dict:
+        """Computes log probabilities of the reference model for a single padded batch of a BCO specific dataset."""
+        with torch.no_grad():
+            if self.ref_model is None:
+                with self.null_ref_context():
+                    if self.is_encoder_decoder:
+                        completion_logits = self.model(
+                            padded_batch["prompt_input_ids"],
+                            attention_mask=padded_batch["prompt_attention_mask"],
+                            decoder_input_ids=padded_batch.get(
+                                "completion_decoder_input_ids"
+                            ),
+                            labels=padded_batch["completion_labels"],
+                        ).logits
+
+                    else:
+                        completion_logits = self.model(
+                            padded_batch["completion_input_ids"],
+                            attention_mask=padded_batch["completion_attention_mask"],
+                        ).logits
+
+            elif self.is_encoder_decoder:
+                completion_logits = self.ref_model(
+                    padded_batch["prompt_input_ids"],
+                    attention_mask=padded_batch["prompt_attention_mask"],
+                    decoder_input_ids=padded_batch.get("completion_decoder_input_ids"),
+                    labels=padded_batch["completion_labels"],
+                ).logits
+
+            else:
+                completion_logits = self.ref_model(
+                    padded_batch["completion_input_ids"],
+                    attention_mask=padded_batch["completion_attention_mask"],
+                ).logits
+
+        completion_logps = self.get_batch_logps(
+            completion_logits,
+            padded_batch["completion_labels"],
+            average_log_prob=False,
+            is_encoder_decoder=self.is_encoder_decoder,
+            label_pad_token_id=self.label_pad_token_id,
+        )
+
+        return completion_logps
+
+    @staticmethod
+    def get_batch_logps(
+        logits: torch.FloatTensor,
+        labels: torch.LongTensor,
+        average_log_prob: bool = False,
+        label_pad_token_id: int = -100,
+        is_encoder_decoder: bool = False,
+    ) -> torch.FloatTensor:
+        """Compute the log probabilities of the given labels under the given logits.
+
+        Args:
+            logits: Logits of the model (unnormalized). Shape: (batch_size, sequence_length, vocab_size)
+            labels:
+                Labels for which to compute the log probabilities. Label tokens with a value of label_pad_token_id are
+                ignored. Shape: (batch_size, sequence_length)
+            average_log_prob:
+                If True, return the average log probability per (non-masked) token. Otherwise, return the sum of the
+                log probabilities of the (non-masked) tokens.
+            label_pad_token_id:
+                The label value to ignore when computing log probabilities.
+            is_encoder_decoder:
+                Whether the model is an encoder-decoder model. If True, the labels are not shifted, and the logits are
+                assumed to already be aligned with the labels. If False, the labels are shifted to the right by one
+                position, and the logits are assumed to be aligned with the shifted labels.
+
+        Returns
+        -------
+            A tensor of shape (batch_size,) containing the average/sum log probabilities of the given labels under the
+            given logits.
+        """
+        if logits.shape[:-1] != labels.shape:
+            raise ValueError(
+                "Logits (batch and sequence length dim) and labels must have the same shape."
+            )
+
+        if not is_encoder_decoder:
+            labels = labels[:, 1:].clone()
+            logits = logits[:, :-1, :]
+        else:
+            # Fixes end-dec RuntimeError
+            labels = labels.clone()
+
+        loss_mask = labels != label_pad_token_id
+
+        # dummy token; we'll ignore the losses on these tokens later
+        labels[labels == label_pad_token_id] = 0
+
+        per_token_logps = selective_log_softmax(logits, labels)
+
+        if average_log_prob:
+            return (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1)
+        return (per_token_logps * loss_mask).sum(-1)
+
+    def forward(
+        self, model: nn.Module, batch: dict[str, list | torch.LongTensor]
+    ) -> tuple[
+        torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor
+    ]:
+        model_kwargs = (
+            {
+                "labels": batch["completion_labels"],
+                "decoder_input_ids": batch.get("completion_decoder_input_ids"),
+            }
+            if self.is_encoder_decoder
+            else {}
+        )
+        if self.aux_loss_enabled:
+            model_kwargs["output_router_logits"] = True
+
+        outputs = model(
+            batch["completion_input_ids"],
+            attention_mask=batch["completion_attention_mask"],
+            **model_kwargs,
+        )
+        completion_logits = outputs.logits
+
+        completion_logps = self.get_batch_logps(
+            completion_logits,
+            batch["completion_labels"],
+            average_log_prob=False,
+            is_encoder_decoder=self.is_encoder_decoder,
+            label_pad_token_id=self.label_pad_token_id,
+        )
+
+        if completion_logps.shape[0] != len(batch["label"]):
+            raise ValueError(
+                "There is a mismatch between the number of examples in this batch and the number of "
+                "examples for which an output sequence was predicted."
+            )
+
+        chosen_idx = [
+            i for i in range(completion_logps.shape[0]) if batch["label"][i] is True
+        ]
+        rejected_idx = [
+            i for i in range(completion_logps.shape[0]) if batch["label"][i] is False
+        ]
+
+        chosen_logps = completion_logps[chosen_idx, ...]
+        rejected_logps = completion_logps[rejected_idx, ...]
+
+        chosen_logits = completion_logits[chosen_idx, ...]
+        rejected_logits = completion_logits[rejected_idx, ...]
+
+        if self.aux_loss_enabled:
+            return (
+                chosen_logps,
+                rejected_logps,
+                chosen_logits,
+                rejected_logits,
+                outputs.aux_loss,
+            )
+        return (chosen_logps, rejected_logps, chosen_logits, rejected_logits)
+
+    def _get_udm_weight(
+        self, rejected_embeddings: torch.FloatTensor
+    ) -> torch.FloatTensor:
+        prob_desirable = self._get_chosen_prob(rejected_embeddings)
+        min_ratio = self.args.min_density_ratio
+        max_ratio = self.args.max_density_ratio
+
+        weight = (prob_desirable / (1 - prob_desirable + 1e-8)).clamp(
+            min=min_ratio, max=max_ratio
+        )
+
+        return weight
+
+    def bco_loss(
+        self,
+        policy_chosen_logps: torch.FloatTensor,
+        policy_rejected_logps: torch.FloatTensor,
+        reference_chosen_logps: torch.FloatTensor,
+        reference_rejected_logps: torch.FloatTensor,
+        chosen_embeddings: torch.FloatTensor | None,
+        rejected_embeddings: torch.FloatTensor | None,
+        do_train: bool = True,
+    ) -> tuple[
+        torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor
+    ]:
+        """Compute the BCO loss for a batch of policy and reference model log probabilities.
+
+        Args:
+            policy_chosen_logps:
+                Log probabilities of the policy model for the chosen responses. Shape: (num(chosen) in batch_size,)
+            policy_rejected_logps:
+                Log probabilities of the policy model for the rejected responses. Shape: (num(rejected) in batch_size,)
+            reference_chosen_logps:
+                Log probabilities of the reference model for the chosen responses. Shape: (num(chosen) in batch_size,)
+            reference_rejected_logps:
+                Log probabilities of the reference model for the rejected responses. Shape: (num(rejected) in
+                batch_size,)
+            chosen_embeddings: embeddings of desirable prompts
+            rejected_embeddings: embeddings of undesirable prompts
+            do_train: whether to update the running delta value. Default is True.
+
+        Returns
+        -------
+            A tuple of four tensors: (losses, chosen_rewards, rejected_rewards, delta). The losses tensor contains the
+            BCO loss for each example in the batch. The chosen_rewards and rejected_rewards tensors contain the rewards
+            for the chosen and rejected responses, respectively. The delta value contains the moving average of all
+            implicit rewards.
+        """
+        chosen_logratios = policy_chosen_logps - reference_chosen_logps
+        chosen_rewards = self.beta * chosen_logratios
+
+        rejected_logratios = policy_rejected_logps - reference_rejected_logps
+        rejected_rewards = self.beta * rejected_logratios
+
+        if do_train:
+            self.running.update(
+                torch.cat((chosen_rewards, rejected_rewards), 0).detach()
+            )
+        delta = torch.as_tensor(self.running.mean, device=chosen_rewards.device)
+
+        chosen_losses = -F.logsigmoid(chosen_rewards - delta)
+        rejected_losses = -F.logsigmoid(-(rejected_rewards - delta))
+
+        if self.match_underlying_distribution:
+            chosen_weight = torch.ones_like(chosen_losses)
+            rejected_weight = self._get_udm_weight(rejected_embeddings)
+
+            losses = torch.cat(
+                (chosen_weight * chosen_losses, rejected_weight * rejected_losses),
+                dim=0,
+            )
+        else:
+            losses = torch.cat((chosen_losses, rejected_losses), dim=0)
+
+        return losses, chosen_rewards, rejected_rewards, delta
+
+    def get_batch_loss_metrics(
+        self,
+        model,
+        batch: dict[str, list | torch.LongTensor],
+        do_train: bool = True,
+    ):
+        """Compute the BCO loss and other metrics for the given batch of inputs for train or test."""
+        metrics = {}
+        batch = {
+            k: (v.to(self.accelerator.device) if isinstance(v, torch.Tensor) else v)
+            for k, v in batch.items()
+        }
+
+        forward_output = self.forward(model, batch)
+        (
+            policy_chosen_logps,
+            policy_rejected_logps,
+            policy_chosen_logits,
+            policy_rejected_logits,
+        ) = forward_output[:4]
+        if self.aux_loss_enabled:
+            aux_loss = forward_output[4]
+
+        # if reference_logps in batch use them, otherwise use the reference model
+        if "reference_logps" in batch:
+            chosen_idx = [
+                i
+                for i in range(batch["reference_logps"].shape[0])
+                if batch["label"][i] is True
+            ]
+            rejected_idx = [
+                i
+                for i in range(batch["reference_logps"].shape[0])
+                if batch["label"][i] is False
+            ]
+
+            reference_chosen_logps = batch["reference_logps"][chosen_idx, ...]
+            reference_rejected_logps = batch["reference_logps"][rejected_idx, ...]
+        else:
+            with torch.no_grad():
+                if self.ref_model is None:
+                    with self.null_ref_context():
+                        (
+                            reference_chosen_logps,
+                            reference_rejected_logps,
+                            _,
+                            _,
+                        ) = self.forward(self.model, batch)[:4]
+                else:
+                    (
+                        reference_chosen_logps,
+                        reference_rejected_logps,
+                        _,
+                        _,
+                    ) = self.forward(self.ref_model, batch)[:4]
+
+        chosen_embeddings, rejected_embeddings = self._get_prompt_embeddings(batch)
+
+        losses, chosen_rewards, rejected_rewards, delta = self.bco_loss(
+            policy_chosen_logps,
+            policy_rejected_logps,
+            reference_chosen_logps,
+            reference_rejected_logps,
+            chosen_embeddings,
+            rejected_embeddings,
+            do_train=do_train,
+        )
+        metrics["delta"] = self.accelerator.gather_for_metrics(delta).mean().item()
+
+        num_chosen = torch.Tensor([len(chosen_rewards)]).to(self.accelerator.device)
+        num_rejected = torch.Tensor([len(rejected_rewards)]).to(self.accelerator.device)
+
+        all_num_chosen = self.accelerator.gather_for_metrics(num_chosen).sum().item()
+        all_num_rejected = (
+            self.accelerator.gather_for_metrics(num_rejected).sum().item()
+        )
+
+        if all_num_chosen > 0:
+            metrics["rewards/chosen_sum"] = (
+                self.accelerator.gather_for_metrics(chosen_rewards.nansum())
+                .nansum()
+                .item()
+            )
+            metrics["logps/chosen_sum"] = (
+                self.accelerator.gather_for_metrics(policy_chosen_logps.nansum())
+                .nansum()
+                .item()
+            )
+            metrics["logits/chosen_sum"] = (
+                self.accelerator.gather_for_metrics(policy_chosen_logits.nansum())
+                .nansum()
+                .item()
+            )
+            metrics["count/chosen"] = all_num_chosen
+
+        if all_num_rejected > 0:
+            metrics["rewards/rejected_sum"] = (
+                self.accelerator.gather_for_metrics(rejected_rewards.nansum())
+                .nansum()
+                .item()
+            )
+            metrics["logps/rejected_sum"] = (
+                self.accelerator.gather_for_metrics(policy_rejected_logps.nansum())
+                .nansum()
+                .item()
+            )
+            metrics["logits/rejected_sum"] = (
+                self.accelerator.gather_for_metrics(policy_rejected_logits.nansum())
+                .nansum()
+                .item()
+            )
+            metrics["count/rejected"] = all_num_rejected
+
+        loss = losses.nanmean()
+        if self.aux_loss_enabled:
+            loss += self.aux_loss_coef * aux_loss
+
+        return loss, metrics
+
+    def compute_loss(
+        self,
+        model: PreTrainedModel | nn.Module,
+        inputs: dict[str, torch.Tensor | Any],
+        return_outputs=False,
+        num_items_in_batch=None,
+    ) -> torch.Tensor | tuple[torch.Tensor, dict[str, torch.Tensor]]:
+        compute_loss_context_manager = (
+            autocast(self.accelerator.device.type)
+            if self._peft_has_been_casted_to_bf16
+            else nullcontext()
+        )
+
+        with compute_loss_context_manager:
+            loss, metrics = self.get_batch_loss_metrics(model, inputs)
+
+        # Make sure to move the loss to the device the original accumulating loss is at back in the `Trainer` class:
+        loss = loss.to(self.args.device)
+        # force log the metrics
+        if self.accelerator.is_main_process:
+            self.store_metrics(metrics, train_eval="train")
+
+        if return_outputs:
+            return (loss, metrics)
+        return loss
+
+    def store_metrics(
+        self, metrics: dict[str, float], train_eval: Literal["train", "eval"] = "train"
+    ) -> None:
+        for key, value in metrics.items():
+            self._stored_metrics[train_eval][key].append(value)
+
+    def _get_train_sampler(
+        self, dataset: Dataset | None = None
+    ) -> torch.utils.data.Sampler | None:
+        if dataset is None:
+            dataset = self.train_dataset
+        if dataset is None or not has_length(dataset):
+            return None
+        return SequentialSampler(dataset)
+
+    def generate_from_model_and_ref(
+        self, model, batch: dict[str, torch.LongTensor]
+    ) -> tuple[str, str]:
+        """Generate samples from the model and reference model for the given batch of inputs."""
+        # If one uses `generate_during_eval` with peft + bf16, we need to explicitly call generate with
+        # the torch amp context manager as some hidden states are silently casted to full precision.
+        generate_context_manager = (
+            autocast(self.accelerator.device.type)
+            if self._peft_has_been_casted_to_bf16
+            else nullcontext()
+        )
+        with generate_context_manager:
+            policy_output = model.generate(
+                input_ids=batch["prompt_input_ids"],
+                attention_mask=batch["prompt_attention_mask"],
+                max_length=self.max_length,
+                do_sample=True,
+                pad_token_id=self.processing_class.pad_token_id,
+            )
+
+            # if reference_output in batch use that otherwise use the reference model
+            if "reference_output" in batch:
+                reference_output = batch["reference_output"]
+            elif self.ref_model is None:
+                with self.null_ref_context():
+                    reference_output = self.model.generate(
+                        input_ids=batch["prompt_input_ids"],
+                        attention_mask=batch["prompt_attention_mask"],
+                        max_length=self.max_length,
+                        do_sample=True,
+                        pad_token_id=self.processing_class.pad_token_id,
+                    )
+            else:
+                reference_output = self.ref_model.generate(
+                    input_ids=batch["prompt_input_ids"],
+                    attention_mask=batch["prompt_attention_mask"],
+                    max_length=self.max_length,
+                    do_sample=True,
+                    pad_token_id=self.processing_class.pad_token_id,
+                )
+
+        policy_output = pad_to_length(
+            policy_output, self.max_length, self.processing_class.pad_token_id
+        )
+        policy_output_decoded = self.processing_class.batch_decode(
+            policy_output, skip_special_tokens=True
+        )
+
+        reference_output = pad_to_length(
+            reference_output, self.max_length, self.processing_class.pad_token_id
+        )
+        reference_output_decoded = self.processing_class.batch_decode(
+            reference_output, skip_special_tokens=True
+        )
+
+        return policy_output_decoded, reference_output_decoded
+
+    def prediction_step(
+        self,
+        model: PreTrainedModel | nn.Module,
+        inputs: dict[str, torch.Tensor | Any],
+        prediction_loss_only: bool,
+        ignore_keys: list[str] | None = None,
+    ):
+        if ignore_keys is None:
+            if hasattr(model, "config"):
+                ignore_keys = getattr(model.config, "keys_to_ignore_at_inference", [])
+            else:
+                ignore_keys = []
+
+        prediction_context_manager = (
+            autocast(self.accelerator.device.type)
+            if self._peft_has_been_casted_to_bf16
+            else nullcontext()
+        )
+        with torch.no_grad(), prediction_context_manager:
+            loss, metrics = self.get_batch_loss_metrics(model, inputs, do_train=False)
+
+        # force log the metrics
+        if self.accelerator.is_main_process:
+            self.store_metrics(metrics, train_eval="eval")
+
+        if prediction_loss_only:
+            return (loss.detach(), None, None)
+
+        # logits for the chosen and rejected samples from model
+        logits_dict = {}
+        if "logits/chosen_sum" in metrics:
+            logits_dict["eval_logits/chosen"] = metrics["logits/chosen_sum"]
+        if "logits/rejected_sum" in metrics:
+            logits_dict["eval_logits/rejected"] = metrics["logits/rejected_sum"]
+        logits = [v for k, v in logits_dict.items() if k not in ignore_keys]
+        logits = torch.tensor(logits, device=self.accelerator.device)
+        labels = torch.zeros(logits.shape[0], device=self.accelerator.device)
+
+        return (loss.detach(), logits, labels)
+
+    def evaluation_loop(
+        self,
+        dataloader: DataLoader,
+        description: str,
+        prediction_loss_only: bool | None = None,
+        ignore_keys: list[str] | None = None,
+        metric_key_prefix: str = "eval",
+    ) -> EvalLoopOutput:
+        """
+        Overriding built-in evaluation loop to store metrics for each batch. Prediction/evaluation loop, shared by
+        `Trainer.evaluate()` and `Trainer.predict()`.
+
+        Works both with or without labels.
+        """
+        # Sample and save to game log if requested (for one batch to save time)
+        if self.generate_during_eval:
+            # Generate random indices within the range of the total number of samples
+            num_samples = len(dataloader.dataset)
+            random_indices = random.sample(
+                range(num_samples), k=self.args.eval_batch_size
+            )
+
+            # Use dataloader.dataset.select to get the random batch without iterating over the DataLoader
+            random_batch_dataset = dataloader.dataset.select(random_indices)
+            random_batch = self.data_collator(random_batch_dataset)
+            random_batch = self._prepare_inputs(random_batch)
+
+            target_labels = torch.tensor(
+                random_batch["label"], dtype=torch.bool, device=self.accelerator.device
+            )
+            target_indices = torch.where(~target_labels)[0]
+            target_batch = {
+                "prompt_input_ids": random_batch["prompt_input_ids"][target_indices],
+                "prompt_attention_mask": random_batch["prompt_attention_mask"][
+                    target_indices
+                ],
+                "prompt": itemgetter(*target_indices)(random_batch["prompt"]),
+            }
+            policy_output_decoded, ref_output_decoded = (
+                self.generate_from_model_and_ref(self.model, target_batch)
+            )
+
+            table = pd.DataFrame(
+                columns=["Prompt", "Policy", "Ref Model"],
+                data=[
+                    [prompt, pol[len(prompt) :], ref[len(prompt) :]]
+                    for prompt, pol, ref in zip(
+                        target_batch["prompt"],
+                        policy_output_decoded,
+                        ref_output_decoded,
+                        strict=True,
+                    )
+                ],
+            )
+            if "wandb" in self.args.report_to:
+                wandb.log({"game_log": wandb.Table(data=table)})
+
+            if "comet_ml" in self.args.report_to:
+                log_table_to_comet_experiment(
+                    name="game_log.csv",
+                    table=table,
+                )
+
+        # Base evaluation
+        initial_output = super().evaluation_loop(
+            dataloader,
+            description,
+            prediction_loss_only,
+            ignore_keys,
+            metric_key_prefix,
+        )
+
+        return initial_output
+
+    def log(self, logs: dict[str, float], start_time: float | None = None) -> None:
+        """
+        Log `logs` on the various objects watching training, including stored metrics.
+
+        Args:
+            logs (`dict[str, float]`):
+                The values to log.
+            start_time (`float`, *optional*):
+                Start time of the training.
+        """
+        # logs either has 'loss' or 'eval_loss'
+        train_eval = "train" if "loss" in logs else "eval"
+        # train metrics should have no prefix, eval should have 'eval_'
+        prefix = "eval_" if train_eval == "eval" else ""
+        # accumulate average metrics from sums and lengths
+        for split in ["chosen", "rejected"]:
+            if f"count/{split}" in self._stored_metrics[train_eval]:
+                count_sum = (
+                    torch.Tensor(self._stored_metrics[train_eval][f"count/{split}"])
+                    .sum()
+                    .item()
+                )
+                for metric in ["rewards", "logps", "logits"]:
+                    logs[f"{prefix}{metric}/{split}"] = (
+                        torch.Tensor(
+                            self._stored_metrics[train_eval][f"{metric}/{split}_sum"]
+                        )
+                        .sum()
+                        .item()
+                        / count_sum
+                    )
+                    # delete obsolete metric
+                    del self._stored_metrics[train_eval][f"{metric}/{split}_sum"]
+                del self._stored_metrics[train_eval][f"count/{split}"]
+        # calculate reward margin
+        if f"{prefix}rewards/chosen" in logs and f"{prefix}rewards/rejected" in logs:
+            logs[f"{prefix}rewards/margins"] = (
+                logs[f"{prefix}rewards/chosen"] - logs[f"{prefix}rewards/rejected"]
+            )
+        # Add averaged stored metrics to logs
+        for key, metrics in self._stored_metrics[train_eval].items():
+            logs[f"{prefix}{key}"] = torch.Tensor(metrics).mean().item()
+        del self._stored_metrics[train_eval]
+        return super().log(logs, start_time)
+
+    # Ensure the model card is saved along with the checkpoint
+    def _save_checkpoint(self, model, trial):
+        if self.args.hub_model_id is None:
+            model_name = Path(self.args.output_dir).name
+        else:
+            model_name = self.args.hub_model_id.split("/")[-1]
+        self.create_model_card(model_name=model_name)
+        super()._save_checkpoint(model, trial)
diff --git a/src/aixpert/training/training/trl/experimental/bema_for_ref_model/__init__.py b/src/aixpert/training/training/trl/experimental/bema_for_ref_model/__init__.py
new file mode 100644
index 0000000..5dd1040
--- /dev/null
+++ b/src/aixpert/training/training/trl/experimental/bema_for_ref_model/__init__.py
@@ -0,0 +1,16 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .callback import BEMACallback
+from .dpo_trainer import DPOTrainer
diff --git a/src/aixpert/training/training/trl/experimental/bema_for_ref_model/callback.py b/src/aixpert/training/training/trl/experimental/bema_for_ref_model/callback.py
new file mode 100644
index 0000000..202b919
--- /dev/null
+++ b/src/aixpert/training/training/trl/experimental/bema_for_ref_model/callback.py
@@ -0,0 +1,243 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+
+import torch
+from transformers import (
+    PreTrainedModel,
+    TrainerControl,
+    TrainerState,
+    TrainingArguments,
+)
+from transformers.trainer_callback import CallbackHandler
+
+from ...trainer.callbacks import BEMACallback as _BEMACallback
+
+
+# Logger for module-level logging
+logger = logging.getLogger(__name__)
+
+
+class CallbackHandlerWithRefModel(CallbackHandler):
+    """
+    A [`~transformers.CallbackHandler`] that supports passing a reference model to callbacks.
+    """
+
+    def __init__(
+        self, callbacks, model, ref_model, processing_class, optimizer, lr_scheduler
+    ):
+        super().__init__(callbacks, model, processing_class, optimizer, lr_scheduler)
+        self.ref_model = ref_model
+
+    # Copied from CallbackHandler.call_event with the addition of `ref_model` to the callback call.
+    def call_event(self, event, args, state, control, **kwargs):
+        for callback in self.callbacks:
+            result = getattr(callback, event)(
+                args,
+                state,
+                control,
+                model=self.model,
+                ref_model=self.ref_model,  # <- Added ref_model to the callback call
+                processing_class=self.processing_class,
+                optimizer=self.optimizer,
+                lr_scheduler=self.lr_scheduler,
+                train_dataloader=self.train_dataloader,
+                eval_dataloader=self.eval_dataloader,
+                **kwargs,
+            )
+            # A Callback can skip the return of `control` if it doesn't change it.
+            if result is not None:
+                control = result
+        return control
+
+
+class BEMACallback(_BEMACallback):
+    # docstyle-ignore
+    r"""
+    A [`~transformers.TrainerCallback`] that implements [BEMA](https://huggingface.co/papers/2508.00180)
+    (Bias-Corrected Exponential Moving Average) by [Adam Block](https://huggingface.co/abblock) and [Cyril
+    Zhang](https://huggingface.co/cyrilzhang). Code from https://github.com/abblock/bema under MIT license.
+
+    BEMA computes model weights that scale like:
+
+    $$
+    \theta_t' = \alpha_t \cdot (\theta_t - \theta_0) + \text{EMA}_t
+    $$
+
+    where  \\( \theta_t \\) is the current model weights,  \\( \theta_0 \\) is a snapshot of the model weights at the
+    first `update_after` step,  \\( \text{EMA}_t  \\) is the exponential moving average of the model weights, and
+     \\( \alpha_t \\) is a scaling factor that decays with the number of steps  \\( t \\) as
+
+    $$
+    \alpha_t = (\rho + \gamma \cdot t)^{-\eta}.
+    $$
+
+    The EMA is computed as:
+
+    $$
+    \text{EMA}_t = (1 - \beta_t) \cdot \text{EMA}_{t-1} + \beta_t \cdot \theta_t
+    $$
+
+    where  \\( \beta_t \\) is a decay factor that decays with the number of steps  \\( t \\) as
+
+    $$
+    \beta_t = (\rho + \gamma \cdot t)^{-\kappa}.
+    $$
+
+    Args:
+        update_freq (`int`, *optional*, defaults to `400`):
+            Update the BEMA weights every X steps. Denoted this as  \\( \phi \\) in the paper.
+        ema_power (`float`, *optional*, defaults to `0.5`):
+            Power for the EMA decay factor. Denoted  \\( \kappa \\) in the paper. To disable EMA, set this to `0.0`.
+        bias_power (`float`, *optional*, defaults to `0.2`):
+            Power for the BEMA scaling factor. Denoted  \\( \eta \\) in the paper. To disable BEMA, set this to `0.0`.
+        lag (`int`, *optional*, defaults to `10`):
+            Initial offset in the weight decay schedule that controls early-stage smoothness by acting as a virtual
+            starting age for the updates. Denoted as  \\( \rho \\) in the paper.
+        update_after (`int`, *optional*, defaults to `0`):
+            Burn-in time before starting to update the BEMA weights. Denoted  \\( \tau \\) in the paper.
+        multiplier (`float`, *optional*, defaults to `1.0`):
+            Initial value for the EMA decay factor. Denoted as  \\( \gamma \\) in the paper.
+        min_ema_multiplier (`float`, *optional*, defaults to `0.0`):
+            Minimum value for the EMA decay factor.
+        device (`str`, *optional*, defaults to `"cpu"`):
+            Device to use for the BEMA buffers, e.g. `"cpu"` or `"cuda"`. Note that in most cases, this device SHOULD
+            BE DIFFERENT from the device used for training in order to avoid OOM.
+        update_ref_model (`bool`, *optional*, defaults to `False`):
+            Whether to update the reference model with BEMA weights. This creates a lagged, smoothed version of the
+            main model as the reference model.
+        ref_model_update_freq (`int`, *optional*, defaults to `400`):
+            Update the reference model with BEMA weights every this many steps.
+        ref_model_update_after (`int`, *optional*, defaults to `0`):
+            Number of steps to wait before starting to update the reference model.
+
+    Example:
+
+    ```python
+    from trl import BEMACallback
+
+    trainer = Trainer(..., callbacks=[BEMACallback()])
+    ```
+    """
+
+    def __init__(
+        self,
+        update_freq: int = 400,
+        ema_power: float = 0.5,
+        bias_power: float = 0.2,
+        lag: int = 10,
+        update_after: int = 0,
+        multiplier: float = 1.0,
+        min_ema_multiplier: float = 0.0,
+        device: str = "cpu",
+        update_ref_model: bool = False,
+        ref_model_update_freq: int = 400,
+        ref_model_update_after: int = 0,
+    ):
+        super().__init__(
+            update_freq,
+            ema_power,
+            bias_power,
+            lag,
+            update_after,
+            multiplier,
+            min_ema_multiplier,
+            device,
+        )
+        # Reference model update parameters
+        self.update_ref_model = update_ref_model
+        self.ref_model_update_freq = ref_model_update_freq
+        self.ref_model_update_after = ref_model_update_after
+
+    @torch.no_grad()
+    def on_step_end(
+        self,
+        args: TrainingArguments,
+        state: TrainerState,
+        control: TrainerControl,
+        model: PreTrainedModel,
+        **kwargs,
+    ):
+        super().on_step_end(args, state, control, model, **kwargs)
+
+        step = state.global_step
+        # Update reference model if enabled
+        if (
+            self.update_ref_model
+            and step >= self.ref_model_update_after
+            and (step - self.ref_model_update_after) % self.ref_model_update_freq == 0
+        ):
+            if "ref_model" not in kwargs:
+                raise ValueError("'ref_model' not found in kwargs.")
+
+            ref_model = kwargs["ref_model"]
+
+            # Get the current BEMA state dict
+            bema_state_dict = self.running_model.state_dict()
+
+            # Handle the case where ref_model is None (PEFT case)
+            if ref_model is None:
+                # In PEFT case, ref_model is None and we need to update the base model of the main model
+                main_model = self._unwrap_model(model)
+                if hasattr(main_model, "get_base_model"):
+                    # This is a PEFT model, update the base model
+                    base_model = main_model.get_base_model()
+                    self._update_model_with_bema_weights(
+                        base_model, bema_state_dict, is_peft_base=True
+                    )
+                else:
+                    # Regular model, update directly
+                    self._update_model_with_bema_weights(
+                        main_model, bema_state_dict, is_peft_base=False
+                    )
+            else:
+                # ref_model is provided, unwrap it and update
+                ref_model = self._unwrap_model(ref_model)
+                if hasattr(ref_model, "get_base_model"):
+                    # This is a PEFT model, update the base model
+                    base_model = ref_model.get_base_model()
+                    self._update_model_with_bema_weights(
+                        base_model, bema_state_dict, is_peft_base=True
+                    )
+                else:
+                    # Regular model, update directly
+                    self._update_model_with_bema_weights(
+                        ref_model, bema_state_dict, is_peft_base=False
+                    )
+
+            logger.info("BEMACallback: Updated reference model with BEMA weights")
+
+    def _update_model_with_bema_weights(
+        self, model, bema_state_dict, is_peft_base=False
+    ):
+        """Helper method to update a model with BEMA weights, handling PEFT and distributed scenarios."""
+        if is_peft_base:
+            # For PEFT base models, filter out adapter parameters
+            filtered_state_dict = {}
+            for key, value in bema_state_dict.items():
+                # Skip adapter parameters
+                if not key.startswith("lora_") and not key.startswith("adapter_"):
+                    # Remove 'base_model.' prefix if it exists
+                    if key.startswith("base_model."):
+                        base_key = key[len("base_model.") :]
+                    else:
+                        base_key = key
+                    filtered_state_dict[base_key] = value
+
+            # Update the base model
+            model.load_state_dict(filtered_state_dict, strict=False)
+        else:
+            # Regular model, update directly
+            model.load_state_dict(bema_state_dict, strict=False)
diff --git a/src/aixpert/training/training/trl/experimental/bema_for_ref_model/dpo_trainer.py b/src/aixpert/training/training/trl/experimental/bema_for_ref_model/dpo_trainer.py
new file mode 100644
index 0000000..e219192
--- /dev/null
+++ b/src/aixpert/training/training/trl/experimental/bema_for_ref_model/dpo_trainer.py
@@ -0,0 +1,30 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...trainer.dpo_trainer import DPOTrainer as _DPOTrainer
+from .callback import CallbackHandlerWithRefModel
+
+
+class DPOTrainer(_DPOTrainer):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # Replace with a new one that calls the events with the reference model
+        self.callback_handler = CallbackHandlerWithRefModel(
+            self.callback_handler.callbacks,
+            self.model,
+            self.ref_model,
+            self.processing_class,
+            self.optimizer,
+            self.lr_scheduler,
+        )
diff --git a/src/aixpert/training/training/trl/experimental/gfpo/__init__.py b/src/aixpert/training/training/trl/experimental/gfpo/__init__.py
new file mode 100644
index 0000000..612a7a5
--- /dev/null
+++ b/src/aixpert/training/training/trl/experimental/gfpo/__init__.py
@@ -0,0 +1,16 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .gfpo_config import GFPOConfig
+from .gfpo_trainer import GFPOTrainer
diff --git a/src/aixpert/training/training/trl/experimental/gfpo/gfpo_config.py b/src/aixpert/training/training/trl/experimental/gfpo/gfpo_config.py
new file mode 100644
index 0000000..b9f736e
--- /dev/null
+++ b/src/aixpert/training/training/trl/experimental/gfpo/gfpo_config.py
@@ -0,0 +1,38 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass, field
+
+from ...trainer.grpo_config import GRPOConfig as _GRPOConfig
+
+
+@dataclass
+class GFPOConfig(_GRPOConfig):
+    num_remains_in_group: int | None = field(
+        default=None,
+        metadata={
+            "help": "number inputs remains after group filter function, `'num_remains_in_group'` must be >=2 if given."
+        },
+    )
+
+    def __post_init__(self):
+        super().__post_init__()
+
+        if (
+            self.num_remains_in_group is not None
+            and self.num_remains_in_group >= self.num_generations
+        ):
+            raise ValueError(
+                f"Number remains in Group {self.num_remains_in_group} must be less than num_generations : {self.num_generations}."
+            )
diff --git a/src/aixpert/training/training/trl/experimental/gfpo/gfpo_trainer.py b/src/aixpert/training/training/trl/experimental/gfpo/gfpo_trainer.py
new file mode 100644
index 0000000..8637214
--- /dev/null
+++ b/src/aixpert/training/training/trl/experimental/gfpo/gfpo_trainer.py
@@ -0,0 +1,499 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from collections.abc import Callable
+from typing import Any
+
+import torch
+from accelerate.utils import gather_object
+
+from ...data_utils import (
+    apply_chat_template,
+    is_conversational,
+    prepare_multimodal_messages,
+)
+from ...trainer.grpo_trainer import GRPOTrainer as _GRPOTrainer
+from ...trainer.utils import nanmax, nanmin, nanstd, pad
+
+
+logger = logging.getLogger(__name__)
+
+GroupFilterFunc = Callable[[list[list[Any]], list[list[Any]]], list[list[float]]]
+
+
+class GFPOTrainer(_GRPOTrainer):
+    def __init__(
+        self,
+        model,
+        reward_funcs,
+        args=None,
+        train_dataset=None,
+        eval_dataset=None,
+        processing_class=None,
+        reward_processing_classes=None,
+        group_filter_func=None,
+        callbacks=None,
+        optimizers=(None, None),
+        peft_config=None,
+    ):
+        super().__init__(
+            model=model,
+            reward_funcs=reward_funcs,
+            args=args,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            processing_class=processing_class,
+            reward_processing_classes=reward_processing_classes,
+            callbacks=callbacks,
+            optimizers=optimizers,
+            peft_config=peft_config,
+        )
+        self.group_filter_func = group_filter_func
+        self.num_remains_in_group = args.num_remains_in_group
+        if self.group_filter_func is None and self.num_remains_in_group is not None:
+            raise ValueError(
+                f"Group filter function must not be None when num_remains_in_group ({self.num_remains_in_group}) is given."
+            )
+        if self.group_filter_func is not None and self.num_remains_in_group is None:
+            logger.warning(
+                "Group filter function is not activated since num_remains_in_group is not set"
+            )
+
+    def _generate_and_score_completions(self, inputs):
+        device = self.accelerator.device
+        mode = "train" if self.model.training else "eval"
+
+        prompts = [x["prompt"] for x in inputs]
+
+        if "images" in inputs[0]:
+            images = [example.get("images") for example in inputs]
+        elif "image" in inputs[0]:
+            images = [
+                [example.get("image")] if example.get("image") is not None else None
+                for example in inputs
+            ]
+        else:
+            images = None
+        # Transformers requires at least one image in the batch, otherwise it throws an error
+        if images is not None and all(img_list == [] for img_list in images):
+            images = None
+
+        # If the prompts are conversational and the inputs contain images, we need to convert the prompts from
+        # [{"role": "user", "content": "What color is the sky?"}] to
+        # [{"role": "user", "content": [{"type": "image", "image": <Image>}, {"type": "text", "text": "What color is the sky?"}]}]
+        if images is not None:
+            prompts = [
+                prepare_multimodal_messages(prompt, image_list)
+                for prompt, image_list in zip(prompts, images, strict=True)
+            ]
+
+        (
+            prompt_ids_list,
+            completion_ids_list,
+            num_items_in_batch,
+            sampling_per_token_logps_list,
+            extra_fields,
+        ) = self._generate(prompts)
+
+        # Convert lists of token IDs to padded tensors
+        prompt_ids = [torch.tensor(ids, device=device) for ids in prompt_ids_list]
+        prompt_mask = [torch.ones_like(ids, dtype=torch.long) for ids in prompt_ids]
+        prompt_ids = pad(
+            prompt_ids, padding_value=self.pad_token_id, padding_side="left"
+        )
+        prompt_mask = pad(prompt_mask, padding_value=0, padding_side="left")
+        completion_ids = [
+            torch.tensor(ids, device=device) for ids in completion_ids_list
+        ]
+        completion_mask = [
+            torch.ones_like(ids, dtype=torch.long) for ids in completion_ids
+        ]
+        completion_ids = pad(
+            completion_ids, padding_value=self.pad_token_id, padding_side="right"
+        )
+        completion_mask = pad(completion_mask, padding_value=0, padding_side="right")
+        if sampling_per_token_logps_list is not None:
+            sampling_per_token_logps = [
+                torch.tensor(logps, device=device)
+                for logps in sampling_per_token_logps_list
+            ]
+            sampling_per_token_logps = pad(
+                sampling_per_token_logps, padding_value=0.0, padding_side="right"
+            )
+        else:
+            sampling_per_token_logps = None
+
+        # If mask_truncated_completions is enabled, zero out truncated completions in completion_mask
+        if self.mask_truncated_completions:
+            eos_and_pad = [self.eos_token_id, self.pad_token_id]
+            is_truncated = torch.tensor(
+                [ids[-1] not in eos_and_pad for ids in completion_ids_list],
+                device=device,
+            )
+            completion_mask = completion_mask * (~is_truncated).unsqueeze(1).int()
+
+        # Concatenate prompt_mask with completion_mask for logit computation
+        prompt_completion_ids = torch.cat(
+            [prompt_ids, completion_ids], dim=1
+        )  # (B, P+C)
+        attention_mask = torch.cat([prompt_mask, completion_mask], dim=1)  # (B, P+C)
+
+        logits_to_keep = completion_ids.size(
+            1
+        )  # we only need to compute the logits for the completion tokens
+        batch_size = (
+            self.args.per_device_train_batch_size
+            if mode == "train"
+            else self.args.per_device_eval_batch_size
+        )
+
+        num_images = (
+            [len(img_list) for img_list in images] if images is not None else None
+        )
+
+        # Get forward_kwargs for models with multimodal inputs
+        if images is not None:
+            prompts_text = [
+                apply_chat_template({"prompt": prompt}, self.processing_class)["prompt"]
+                for prompt in prompts
+            ]
+            prompt_inputs = self.processing_class(
+                images=images, text=prompts_text, padding=True, return_tensors="pt"
+            )
+            prompt_inputs = super()._prepare_inputs(prompt_inputs)
+            forward_kwargs = {
+                k: v
+                for k, v in prompt_inputs.items()
+                if k not in ["input_ids", "attention_mask"]
+            }
+        else:
+            forward_kwargs = {}
+
+        # If token_type_ids are used, extend them with zeros for the completion part
+        if "token_type_ids" in forward_kwargs:
+            token_type_ids = forward_kwargs["token_type_ids"]
+            forward_kwargs["token_type_ids"] = torch.cat(
+                [token_type_ids, token_type_ids.new_zeros(completion_ids.shape)], dim=1
+            )
+
+        with torch.no_grad():
+            # If the generation and optimization steps are misaligned—i.e., if generation does not occur at the end of
+            # a full optimizer step (when gradient_accumulation_steps is not a multiple of generate_every)—then the
+            # samples may come from an earlier version of the model. In that case, we need to track old_per_token_logps
+            # for importance sampling. If the steps are aligned, importance sampling isn't necessary and we set
+            # old_per_token_logps to None.
+            # When using vLLM, we always compute old_per_token_logps for importance sampling, it was shown that the
+            # distribution mismatch between vLLM and the training model can be large and harm the training.
+            generate_every = (
+                self.args.steps_per_generation * self.num_iterations
+            )  # generation frequency
+            if self.args.gradient_accumulation_steps % generate_every != 0 or (
+                self.use_vllm and self.vllm_importance_sampling_correction
+            ):
+                old_per_token_logps, _ = self._get_per_token_logps_and_entropies(
+                    self.model,
+                    prompt_completion_ids,
+                    attention_mask,
+                    logits_to_keep,
+                    batch_size,
+                    num_images=num_images,
+                    **forward_kwargs,  # may contain pixel_values, image_grid_thw, pixel_attention_mask and image_sizes
+                )
+            else:
+                old_per_token_logps = None
+
+            # Compute the importance sampling ratio when using vLLM, to correct for potential distribution mismatch
+            if self.use_vllm and self.vllm_importance_sampling_correction:
+                importance_sampling_ratio = torch.exp(
+                    old_per_token_logps - sampling_per_token_logps
+                )
+                importance_sampling_ratio = torch.clamp(
+                    importance_sampling_ratio, max=self.vllm_importance_sampling_cap
+                )
+
+            # Compute the per-token log probabilities for the reference model
+            if self.beta != 0.0:
+                if self.ref_model is not None:
+                    ref_per_token_logps, _ = self._get_per_token_logps_and_entropies(
+                        self.ref_model,
+                        prompt_completion_ids,
+                        attention_mask,
+                        logits_to_keep,
+                        batch_size=batch_size,
+                        num_images=num_images,
+                        **forward_kwargs,  # may contain pixel_values, image_grid_thw, pixel_attention_mask and image_sizes
+                    )
+                else:
+                    with self.accelerator.unwrap_model(self.model).disable_adapter():
+                        ref_per_token_logps, _ = (
+                            self._get_per_token_logps_and_entropies(
+                                self.model,
+                                prompt_completion_ids,
+                                attention_mask,
+                                logits_to_keep,
+                                batch_size=batch_size,
+                                num_images=num_images,
+                                **forward_kwargs,  # may contain pixel_values, image_grid_thw, pixel_attention_mask and image_sizes
+                            )
+                        )
+            else:
+                ref_per_token_logps = None
+
+        # Decode
+        prompts_text = self.processing_class.batch_decode(
+            prompt_ids, skip_special_tokens=True
+        )
+        completions_text = self.processing_class.batch_decode(
+            completion_ids, skip_special_tokens=True
+        )
+        if is_conversational(inputs[0]):
+            completions = []
+            for prompt, completion in zip(prompts, completions_text, strict=True):
+                bootstrap = (
+                    prompt.pop()["content"] if prompt[-1]["role"] == "assistant" else ""
+                )
+                if isinstance(
+                    bootstrap, list
+                ):  # for VLM, the format might be [{"type": "text", "text": "..."}]
+                    assert len(bootstrap) == 1 and bootstrap[0]["type"] == "text"
+                    bootstrap = bootstrap[0]["text"]
+                completions.append(
+                    [{"role": "assistant", "content": bootstrap + completion}]
+                )
+        else:
+            completions = completions_text
+
+        # Calculate rewards for each reward function. rewards_per_func aggregates rewards across all processes. This is
+        # important because rewards will be normalized per group, and completions are distributed. We will later slice
+        # rewards_per_func to extract each process's subset.
+        rewards_per_func = self._calculate_rewards(
+            inputs, prompts, completions, completion_ids_list
+        )
+
+        # Apply weights to each reward function's output and sum
+        rewards = (
+            rewards_per_func * self.reward_weights.to(device).unsqueeze(0)
+        ).nansum(dim=1)
+
+        num_in_group = self.num_generations
+        num_inputs_in_device = len(prompts)
+
+        if self.num_remains_in_group is not None and mode == "train":
+            num_in_group = self.num_remains_in_group
+
+            all_completions = gather_object(completions)
+
+            group_filter_scores = self.group_filter_func(
+                group_completions=[
+                    all_completions[i : i + 1 * self.num_generations]
+                    for i in range(len(all_completions) // self.num_generations)
+                ],
+                group_rewards=rewards.view(-1, self.num_generations).tolist(),
+            )
+            group_filter_scores = torch.tensor(group_filter_scores, device=device)
+
+            _, group_local_indices = torch.topk(
+                group_filter_scores, self.num_remains_in_group, dim=-1
+            )
+            group_row_offsets = torch.arange(
+                0, len(all_completions), self.num_generations, device=device
+            ).unsqueeze(1)
+            group_global_indices = group_row_offsets + group_local_indices
+            group_global_indices = group_global_indices.flatten()
+
+            rewards = rewards[group_global_indices].contiguous()
+            rewards_per_func = rewards_per_func[group_global_indices, :].contiguous()
+
+            num_inputs_in_device = int(
+                len(prompts) / self.num_generations * self.num_remains_in_group
+            )
+
+        # Compute grouped-wise rewards
+        mean_grouped_rewards = rewards.view(-1, num_in_group).mean(dim=1)
+
+        # Normalize the rewards to compute the advantages
+        mean_grouped_rewards = mean_grouped_rewards.repeat_interleave(
+            num_in_group, dim=0
+        )
+        advantages = rewards - mean_grouped_rewards
+
+        if self.scale_rewards in ["group", "none"]:
+            # If self.scale_rewards = "none", we'll still log group level std
+            std_rewards = rewards.view(-1, num_in_group).std(dim=1)
+            std_rewards = std_rewards.repeat_interleave(num_in_group, dim=0)
+        elif self.scale_rewards == "batch":
+            # Compute global std
+            std_rewards = rewards.std().expand_as(rewards)
+        else:
+            raise ValueError(
+                f"Invalid value for scale_rewards: {self.scale_rewards}. Must be one of 'batch', 'group', or 'none'."
+            )
+
+        is_std_zero = torch.isclose(std_rewards, torch.zeros_like(std_rewards))
+        if self.scale_rewards != "none":
+            advantages = advantages / (std_rewards + 1e-4)
+
+        # Slice to keep only the local part of the data
+        process_slice = slice(
+            self.accelerator.process_index * num_inputs_in_device,
+            (self.accelerator.process_index + 1) * num_inputs_in_device,
+        )
+        all_process_advantages = (
+            advantages.clone()
+        )  # keep the aggregated advantages for logging
+        advantages = advantages[process_slice]
+
+        if self.num_remains_in_group is not None and mode == "train":
+            local_input_indices_to_keep = group_global_indices[
+                process_slice
+            ] - self.accelerator.process_index * len(
+                prompts
+            )  # step is length of prompts
+
+            prompt_ids = prompt_ids[local_input_indices_to_keep].contiguous()
+            prompt_mask = prompt_mask[local_input_indices_to_keep].contiguous()
+            completion_ids = completion_ids[local_input_indices_to_keep].contiguous()
+            completion_mask = completion_mask[local_input_indices_to_keep].contiguous()
+            attention_mask = attention_mask[local_input_indices_to_keep].contiguous()
+            completion_lengths = completion_mask.sum(1)
+            agg_completion_lengths = self.accelerator.gather(completion_lengths)
+            num_items_in_batch = agg_completion_lengths.sum()
+
+            if sampling_per_token_logps is not None:
+                sampling_per_token_logps = sampling_per_token_logps[
+                    local_input_indices_to_keep
+                ].contiguous()
+            if old_per_token_logps is not None:
+                old_per_token_logps = old_per_token_logps[
+                    local_input_indices_to_keep
+                ].contiguous()
+            if ref_per_token_logps is not None:
+                ref_per_token_logps = ref_per_token_logps[
+                    local_input_indices_to_keep
+                ].contiguous()
+            if self.use_vllm and self.vllm_importance_sampling_correction:
+                importance_sampling_ratio = importance_sampling_ratio[
+                    local_input_indices_to_keep
+                ].contiguous()
+
+        # Calculate mean reward per function, but only for samples where the function was applied (non-NaN values)
+        for i, reward_func_name in enumerate(self.reward_func_names):
+            mean_rewards = torch.nanmean(rewards_per_func[:, i]).item()
+            self._metrics[mode][f"rewards/{reward_func_name}/mean"].append(mean_rewards)
+            std_func_rewards = nanstd(rewards_per_func[:, i]).item()
+            self._metrics[mode][f"rewards/{reward_func_name}/std"].append(
+                std_func_rewards
+            )
+        self._metrics[mode]["reward"].append(mean_grouped_rewards.mean().item())
+        self._metrics[mode]["reward_std"].append(std_rewards.mean().item())
+        self._metrics[mode]["frac_reward_zero_std"].append(
+            is_std_zero.float().mean().item()
+        )
+
+        # Log prompt and completion texts
+        all_prompts_text = gather_object(prompts_text)
+        all_completions_text = gather_object(completions_text)
+        all_images = gather_object(images) if images is not None else None
+        if self.num_remains_in_group is not None and mode == "train":
+            group_global_indices_list = group_global_indices.tolist()
+            all_prompts_text = [all_prompts_text[i] for i in group_global_indices_list]
+            all_completions_text = [
+                all_completions_text[i] for i in group_global_indices_list
+            ]
+            if images is not None:
+                all_images = [all_images[i] for i in group_global_indices_list]
+
+        self._logs["prompt"].extend(all_prompts_text)
+        self._logs["completion"].extend(all_completions_text)
+        for i, name in enumerate(self.reward_func_names):
+            self._logs["rewards"][name].extend(rewards_per_func[:, i].tolist())
+        self._logs["advantages"].extend(all_process_advantages.tolist())
+
+        if images is not None:
+            self._logs["images"].extend(all_images)
+
+        if self.use_vllm and self.vllm_importance_sampling_correction:
+            delta = torch.abs(old_per_token_logps - sampling_per_token_logps)
+            delta = delta[completion_mask.bool()]
+            mean_delta = (
+                torch.mean(delta)
+                if delta.numel() > 0
+                else torch.tensor(0.0, device=device)
+            )
+            max_delta = (
+                torch.max(delta)
+                if delta.numel() > 0
+                else torch.tensor(0.0, device=device)
+            )
+            self._metrics[mode]["sampling/sampling_logp_difference/mean"].append(
+                self.accelerator.gather(mean_delta).mean().item()
+            )
+            self._metrics[mode]["sampling/sampling_logp_difference/max"].append(
+                self.accelerator.gather(max_delta).max().item()
+            )
+
+            flat_is_ratio = importance_sampling_ratio[completion_mask.bool()]
+            min_importance_sampling_ratio = (
+                torch.min(flat_is_ratio)
+                if flat_is_ratio.numel() > 0
+                else torch.tensor(0.0, device=device)
+            )
+            mean_importance_sampling_ratio = (
+                torch.mean(flat_is_ratio)
+                if flat_is_ratio.numel() > 0
+                else torch.tensor(0.0, device=device)
+            )
+            max_importance_sampling_ratio = (
+                torch.max(flat_is_ratio)
+                if flat_is_ratio.numel() > 0
+                else torch.tensor(0.0, device=device)
+            )
+            self._metrics[mode]["sampling/importance_sampling_ratio/min"].append(
+                nanmin(self.accelerator.gather(min_importance_sampling_ratio)).item()
+            )
+            self._metrics[mode]["sampling/importance_sampling_ratio/mean"].append(
+                self.accelerator.gather(mean_importance_sampling_ratio).nanmean().item()
+            )
+            self._metrics[mode]["sampling/importance_sampling_ratio/max"].append(
+                nanmax(self.accelerator.gather(max_importance_sampling_ratio)).item()
+            )
+
+        output = {
+            "prompt_ids": prompt_ids,
+            "prompt_mask": prompt_mask,
+            "completion_ids": completion_ids,
+            "completion_mask": completion_mask,
+            "advantages": advantages,
+            "num_items_in_batch": num_items_in_batch,
+        }
+        if old_per_token_logps is not None:
+            output["old_per_token_logps"] = old_per_token_logps
+        if self.use_vllm and self.vllm_importance_sampling_correction:
+            output["importance_sampling_ratio"] = importance_sampling_ratio
+        if ref_per_token_logps is not None:
+            output["ref_per_token_logps"] = ref_per_token_logps
+        if "pixel_values" in forward_kwargs:
+            output["pixel_values"] = forward_kwargs["pixel_values"]
+        if "image_grid_thw" in forward_kwargs:
+            output["image_grid_thw"] = forward_kwargs["image_grid_thw"]
+        if "pixel_attention_mask" in forward_kwargs:
+            output["pixel_attention_mask"] = forward_kwargs["pixel_attention_mask"]
+        if "image_sizes" in forward_kwargs:
+            output["image_sizes"] = forward_kwargs["image_sizes"]
+        if "token_type_ids" in forward_kwargs:
+            output["token_type_ids"] = forward_kwargs["token_type_ids"]
+        if images is not None:
+            output["num_images"] = num_images
+        return output
diff --git a/src/aixpert/training/training/trl/experimental/gold/__init__.py b/src/aixpert/training/training/trl/experimental/gold/__init__.py
new file mode 100644
index 0000000..97b3b3c
--- /dev/null
+++ b/src/aixpert/training/training/trl/experimental/gold/__init__.py
@@ -0,0 +1,19 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .gold_config import GOLDConfig
+from .gold_trainer import GOLDTrainer
+
+
+__all__ = ["GOLDConfig", "GOLDTrainer"]
diff --git a/src/aixpert/training/training/trl/experimental/gold/gold.py b/src/aixpert/training/training/trl/experimental/gold/gold.py
new file mode 100644
index 0000000..3461556
--- /dev/null
+++ b/src/aixpert/training/training/trl/experimental/gold/gold.py
@@ -0,0 +1,153 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# /// script
+# dependencies = [
+#     "trl @ git+https://github.com/huggingface/trl.git",
+#     "peft",
+#     "trackio",
+# ]
+# ///
+
+# docstyle-ignore
+"""
+# Full training:
+python trl/experimental/gold/gold.py \
+    --model_name_or_path meta-llama/Llama-3.2-1B-Instruct \
+    --teacher_model_name_or_path Qwen/Qwen2-1.5B-Instruct \
+    --dataset_name trl-lib/chatbot_arena_completions \
+    --learning_rate 2e-5 \
+    --per_device_train_batch_size 4 \
+    --gradient_accumulation_steps 8 \
+    --output_dir gold-model \
+    --num_train_epochs 1 \
+    --push_to_hub \
+    --gradient_checkpointing
+
+# LoRA:
+python trl/experimental/gold/gold.py \
+    --model_name_or_path meta-llama/Llama-3.2-1B-Instruct \
+    --teacher_model_name_or_path Qwen/Qwen2-1.5B-Instruct \
+    --dataset_name trl-lib/chatbot_arena_completions \
+    --learning_rate 2e-4 \
+    --per_device_train_batch_size 4 \
+    --gradient_accumulation_steps 8 \
+    --output_dir gold-model \
+    --num_train_epochs 1 \
+    --push_to_hub \
+    --gradient_checkpointing \
+    --use_peft \
+    --lora_r 64 \
+    --lora_alpha 16
+"""
+
+from datasets import load_dataset
+from transformers import AutoTokenizer, GenerationConfig
+
+from trl import (
+    LogCompletionsCallback,
+    ModelConfig,
+    ScriptArguments,
+    TrlParser,
+    get_kbit_device_map,
+    get_peft_config,
+    get_quantization_config,
+)
+from trl.experimental.gold.gold_config import GOLDConfig
+from trl.experimental.gold.gold_trainer import GOLDTrainer
+
+
+if __name__ == "__main__":
+    parser = TrlParser((ScriptArguments, GOLDConfig, ModelConfig))
+    script_args, training_args, model_args = parser.parse_args_and_config()
+
+    ################
+    # Model & Tokenizer
+    ################
+    quantization_config = get_quantization_config(model_args)
+    model_kwargs = dict(
+        revision=training_args.student_model_revision,
+        trust_remote_code=model_args.trust_remote_code,
+        attn_implementation=model_args.attn_implementation,
+        torch_dtype=model_args.dtype,
+        use_cache=False if training_args.gradient_checkpointing else True,
+        device_map=get_kbit_device_map() if quantization_config is not None else None,
+        quantization_config=quantization_config,
+    )
+    training_args.model_init_kwargs = model_kwargs
+
+    if (
+        training_args.teacher_tokenizer_name_or_path is None
+        and training_args.use_uld_loss
+    ):
+        training_args.teacher_tokenizer_name_or_path = (
+            training_args.teacher_model_name_or_path
+        )
+    teacher_model_kwargs = dict(
+        revision=model_args.model_revision,
+        trust_remote_code=model_args.trust_remote_code,
+        attn_implementation=model_args.attn_implementation,
+        torch_dtype=model_args.dtype,
+        use_cache=True,
+        device_map=get_kbit_device_map() if quantization_config is not None else None,
+        quantization_config=quantization_config,
+    )
+    training_args.teacher_model_init_kwargs = teacher_model_kwargs
+
+    tokenizer = AutoTokenizer.from_pretrained(
+        model_args.model_name_or_path,
+        revision=model_args.model_revision,
+        trust_remote_code=model_args.trust_remote_code,
+        padding_side="left",
+    )
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+
+    ################
+    # Dataset
+    ################
+    dataset = load_dataset(script_args.dataset_name, name=script_args.dataset_config)
+
+    ################
+    # Training
+    ################
+    trainer = GOLDTrainer(
+        model=model_args.model_name_or_path,
+        teacher_model=training_args.teacher_model_name_or_path,
+        args=training_args,
+        train_dataset=dataset[script_args.dataset_train_split],
+        eval_dataset=dataset[script_args.dataset_test_split]
+        if training_args.eval_strategy != "no"
+        else None,
+        processing_class=tokenizer,
+        peft_config=get_peft_config(model_args),
+    )
+
+    if training_args.eval_strategy != "no":
+        generation_config = GenerationConfig(
+            max_new_tokens=training_args.max_new_tokens,
+            do_sample=True,
+            temperature=training_args.temperature,
+        )
+        completions_callback = LogCompletionsCallback(
+            trainer, generation_config, num_prompts=8
+        )
+        trainer.add_callback(completions_callback)
+
+    trainer.train()
+
+    # Save and push to hub
+    trainer.save_model(training_args.output_dir)
+    if training_args.push_to_hub:
+        trainer.push_to_hub(dataset_name=script_args.dataset_name)
diff --git a/src/aixpert/training/training/trl/experimental/gold/gold_config.py b/src/aixpert/training/training/trl/experimental/gold/gold_config.py
new file mode 100644
index 0000000..20f1271
--- /dev/null
+++ b/src/aixpert/training/training/trl/experimental/gold/gold_config.py
@@ -0,0 +1,451 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass, field
+from typing import Any
+
+from transformers import TrainingArguments
+
+from ...trainer.sft_config import SFTConfig
+
+
+@dataclass
+class GOLDConfig(SFTConfig):
+    r"""
+    Configuration class for [`GOLDTrainer`].
+
+    This class includes only the parameters that are specific to GOLD training. For a full list of training arguments,
+    please refer to the [`~transformers.TrainingArguments`] and [`SFTConfig`] documentation.
+
+    Args:
+        temperature (`float`, *optional*, defaults to `0.9`):
+            Temperature for sampling. The higher the temperature, the more random the completions.
+        lmbda (`float`, *optional*, defaults to `0.5`):
+            Lambda parameter that controls the student data fraction (i.e., the proportion of on-policy
+            student-generated outputs).
+        beta (`float`, *optional*, defaults to `0.5`):
+            Interpolation coefficient between `0.0` and `1.0` of the Generalized Jensen-Shannon Divergence loss. When
+            beta is `0.0`, the loss is the KL divergence. When beta is `1.0`, the loss is the Inverse KL Divergence.
+        max_completion_length (`int`, *optional*, defaults to `128`):
+            Maximum number of tokens to generate per completion.
+        teacher_model_name_or_path (`str` or `None`, *optional*, defaults to `None`):
+            Model name or path of the teacher model. If `None`, the teacher model will be the same as the model being
+            trained.
+        teacher_model_init_kwargs (`dict[str, Any]]` or `None`, *optional*, defaults to `None`):
+            Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the teacher model
+            from a string.
+        teacher_tokenizer_name_or_path (`str` or `None`, *optional*, defaults to `None`):
+            Tokenizer name or path for the teacher model. If None when using ULD loss, will use the same tokenizer as
+            the student model (not recommended for cross-tokenizer distillation).
+        disable_dropout (`bool`, *optional*, defaults to `True`):
+            Whether to disable dropout in the model.
+        seq_kd (`bool`, *optional*, defaults to `False`):
+            Seq_kd parameter that controls whether to perform Sequence-Level KD (can be viewed as supervised FT on
+            teacher-generated output).
+        use_uld_loss (`bool`, *optional*, defaults to `False`):
+            Whether to use Universal Logit Distillation (ULD) loss instead of Generalized Jensen-Shannon Divergence
+            loss.
+        uld_crossentropy_weight (`float`, *optional*, defaults to `0.0`):
+            Weight for the cross-entropy loss component in ULD loss. If 0, only ULD distillation loss is used.
+        uld_distillation_weight (`float`, *optional*, defaults to `1.0`):
+            Weight for the distillation loss component in ULD loss.
+        uld_student_temperature (`float`, *optional*, defaults to `1.0`):
+            Temperature for student logits in ULD loss computation.
+        uld_teacher_temperature (`float`, *optional*, defaults to `1.0`):
+            Temperature for teacher logits in ULD loss computation.
+        uld_skip_student_eos (`bool`, *optional*, defaults to `True`):
+            Whether to skip EOS token for student in ULD loss computation.
+        uld_skip_teacher_eos (`bool`, *optional*, defaults to `True`):
+            Whether to skip EOS token for teacher in ULD loss computation.
+        use_vllm (`bool`, *optional*, defaults to `False`):
+            Whether to use vLLM for generating completions from the student model. Requires `vllm` to be installed.
+        vllm_mode (`str`, *optional*, defaults to `"server"`):
+            Mode for student vLLM integration. Either `"server"` (connect to a running TRL vLLM server) or `"colocate"`
+            (run vLLM in the same process).
+        vllm_server_host (`str`, *optional*, defaults to `"0.0.0.0"`):
+            Host of the vLLM server for the student model (if `vllm_mode="server"`).
+        vllm_server_port (`int`, *optional*, defaults to `8001`):
+            Port of the vLLM server for the student model (if `vllm_mode="server"`).
+        vllm_server_timeout (`float`, *optional*, defaults to `240.0`):
+            Timeout for connecting to the student vLLM server (if `vllm_mode="server"`).
+        vllm_gpu_memory_utilization (`float`, *optional*, defaults to `0.9`):
+            GPU memory utilization for the colocated student vLLM engine (if `vllm_mode="colocate"`). It is recommended
+            to set this to a low value if the student and teacher models share the same GPU.
+        vllm_tensor_parallel_size (`int`, *optional*, defaults to `1`):
+            Tensor parallel size for the colocated student vLLM engine (if `vllm_mode="colocate"`).
+        vllm_guided_decoding_regex (`str` or `None`, *optional*, defaults to `None`):
+            Regex for vLLM guided decoding for the student model.
+        vllm_sync_frequency (`int`, *optional*, defaults to `1`):
+            Frequency (in training steps) to synchronize student model weights to vLLM engine. Set to 1 to sync after
+            every step.
+        vllm_enable_sleep_mode (`bool`, *optional*, defaults to `False`):
+            Enable vLLM sleep mode to offload student weights/cache during the optimizer step. Keeps GPU memory usage
+            low, but waking the engine adds host–device transfer latency.
+    """
+
+    _VALID_DICT_FIELDS = TrainingArguments._VALID_DICT_FIELDS + [
+        "teacher_model_init_kwargs"
+    ]
+
+    # Parameters whose default values are overridden from TrainingArguments
+    learning_rate: float = field(
+        default=1e-7,
+        metadata={"help": "The initial learning rate for AdamW."},
+    )
+
+    # GOLD-specific parameters
+    temperature: float = field(
+        default=0.9,
+        metadata={
+            "help": "Temperature for sampling. The higher the temperature, the more random the completions."
+        },
+    )
+    top_p: float = field(
+        default=0.95,
+        metadata={
+            "help": "If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to "
+            "`top_p` or higher are kept for generation."
+        },
+    )
+    top_k: int = field(
+        default=0,
+        metadata={
+            "help": "The number of highest probability vocabulary tokens to keep for top-k-filtering."
+        },
+    )
+    lmbda: float = field(
+        default=0.5,
+        metadata={
+            "help": "Lambda parameter that controls the student data fraction (i.e., the proportion of on-policy "
+            "student-generated outputs)."
+        },
+    )
+    beta: float = field(
+        default=0.5,
+        metadata={
+            "help": "Interpolation coefficient between `0.0` and `1.0` of the Generalized Jensen-Shannon Divergence "
+            "loss. When beta is `0.0`, the loss is the KL divergence. When beta is `1.0`, the loss is the Inverse KL "
+            "Divergence."
+        },
+    )
+    max_completion_length: int = field(
+        default=128,
+        metadata={"help": "Maximum number of tokens to generate per completion."},
+    )
+    student_model_revision: str = field(
+        default="main",
+        metadata={
+            "help": "Revision of the student model to use. If not specified, the default revision of the model will be used."
+        },
+    )
+    teacher_model_name_or_path: str | None = field(
+        default=None,
+        metadata={
+            "help": "Model name or path of the teacher model. If `None`, the teacher model will be the same as the "
+            "model being trained."
+        },
+    )
+    teacher_model_init_kwargs: dict[str, Any] | None = field(
+        default=None,
+        metadata={
+            "help": "Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the "
+            "teacher model from a string."
+        },
+    )
+    teacher_tokenizer_name_or_path: str | None = field(
+        default=None,
+        metadata={
+            "help": "Tokenizer name or path for the teacher model. If None when using ULD loss, will use the same "
+            "tokenizer as the student model (not recommended for cross-tokenizer distillation)."
+        },
+    )
+    disable_dropout: bool = field(
+        default=True,
+        metadata={"help": "Whether to disable dropouts in `model`."},
+    )
+    seq_kd: bool = field(
+        default=False,
+        metadata={
+            "help": "Seq_kd parameter that controls whether to perform Sequence-Level KD (can be viewed as supervised "
+            "FT on teacher-generated output)."
+        },
+    )
+    steps_per_generation: int | None = field(
+        default=None,
+        metadata={
+            "help": "Number of optimization steps per generation. If `None`, it defaults to gradient_accumulation_steps."
+        },
+    )
+
+    # ULD Loss parameters
+    use_uld_loss: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to use Universal Logit Distillation (ULD) loss instead of Generalized Jensen-Shannon Divergence loss."
+        },
+    )
+    use_extended_uld: bool = field(
+        default=True,
+        metadata={
+            "help": (
+                "Whether to enable extended ULD alignment that uses tokenizers to align and merge token "
+                "probabilities across student and teacher tokenizations. When True, the trainer will compute "
+                "token mappings and merge probabilities for split tokens; when False, ULD will use simple "
+                "positional truncation like in the original ULD paper."
+            )
+        },
+    )
+    uld_use_hybrid_loss: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "Whether to use a hybrid loss that combines ULD loss and JSD loss. When True, the final loss is a "
+                "a combination of JSD for known token mappings and ULD for unknown token mappings."
+            )
+        },
+    )
+    uld_hybrid_matched_weight: float | None = field(
+        default=None,
+        metadata={
+            "help": (
+                "Weight for the matched token loss component when using hybrid ULD + JSD loss. This weight scales "
+                "the JSD loss computed over tokens that have a direct mapping between student and teacher "
+                "tokenizations. If None, uses adaptive weighting based on vocabulary overlap. Must be set together "
+                "with uld_hybrid_unmatched_weight (both None or both float)."
+            )
+        },
+    )
+    uld_hybrid_unmatched_weight: float | None = field(
+        default=None,
+        metadata={
+            "help": (
+                "Weight for the unmatched token loss component when using hybrid ULD + JSD loss. This weight scales "
+                "the ULD loss computed over tokens that do not have a direct mapping between student and teacher "
+                "tokenizations. If None, uses adaptive weighting based on vocabulary overlap. Must be set together "
+                "with uld_hybrid_matched_weight (both None or both float)."
+            )
+        },
+    )
+    uld_crossentropy_weight: float = field(
+        default=0.0,
+        metadata={"help": "Weight for the cross-entropy loss component in ULD loss."},
+    )
+    uld_distillation_weight: float = field(
+        default=1.0,
+        metadata={"help": "Weight for the distillation loss component in ULD loss."},
+    )
+    uld_student_temperature: float = field(
+        default=1.0,
+        metadata={"help": "Temperature for student logits in ULD loss computation."},
+    )
+    uld_teacher_temperature: float = field(
+        default=1.0,
+        metadata={"help": "Temperature for teacher logits in ULD loss computation."},
+    )
+
+    uld_skip_student_eos: bool = field(
+        default=True,
+        metadata={
+            "help": "Whether to skip EOS token for student in ULD loss computation."
+        },
+    )
+    uld_skip_teacher_eos: bool = field(
+        default=True,
+        metadata={
+            "help": "Whether to skip EOS token for teacher in ULD loss computation."
+        },
+    )
+
+    # transformers paged attention
+    use_transformers_paged: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to use the `transformers` paged implementation for generation. If set to `True`, the "
+            "`transformers` paged implementation will be used for generation instead of the default padded "
+            "implementation."
+        },
+    )
+
+    # vLLM parameters
+    use_vllm: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to use vLLM for generating completions. Requires `vllm` to be installed."
+        },
+    )
+    vllm_mode: str = field(
+        default="server",
+        metadata={
+            "help": 'Mode for vLLM integration. Either "server" (connect to a running TRL vLLM server) or "colocate" (run vLLM in the same process).'
+        },
+    )
+    vllm_server_host: str = field(
+        default="0.0.0.0",
+        metadata={"help": 'Host of the vLLM server when `vllm_mode="server"`.'},
+    )
+    vllm_server_port: int = field(
+        default=8001,
+        metadata={"help": 'Port of the vLLM server when `vllm_mode="server"`.'},
+    )
+    vllm_server_timeout: float = field(
+        default=240.0,
+        metadata={
+            "help": 'Timeout (in seconds) for connecting to the vLLM server when `vllm_mode="server"`.'
+        },
+    )
+    vllm_gpu_memory_utilization: float = field(
+        default=0.9,
+        metadata={
+            "help": 'GPU memory utilization for the colocated vLLM engine when `vllm_mode="colocate"`. Lower values reduce contention when sharing a device with the student/teacher models.'
+        },
+    )
+    vllm_tensor_parallel_size: int = field(
+        default=1,
+        metadata={
+            "help": 'Tensor parallel size for the colocated vLLM engine when `vllm_mode="colocate"`.'
+        },
+    )
+    vllm_guided_decoding_regex: str | None = field(
+        default=None,
+        metadata={"help": "Regex pattern used for vLLM guided decoding (optional)."},
+    )
+    vllm_sync_frequency: int = field(
+        default=1,
+        metadata={
+            "help": "Frequency (in training steps) to synchronize model weights to the vLLM engine. Set to 1 to sync after every step."
+        },
+    )
+    vllm_enable_sleep_mode: bool = field(
+        default=False,
+        metadata={
+            "help": "Enable vLLM sleep mode to offload student weights/cache during the optimizer step. Keeps GPU "
+            "memory usage low, but waking the engine adds host–device transfer latency."
+        },
+    )
+    # Parameters that control the logging
+    log_completions: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to log a sample of (prompt, completion) pairs every `logging_steps` steps. If `rich` is "
+            "installed, it prints the sample. If `wandb` logging is enabled, it logs it to `wandb`."
+        },
+    )
+    log_completions_steps: int = field(
+        default=100,
+        metadata={
+            "help": "Number of steps between logging (prompt, completion) pairs. Only used if `log_completions` is "
+            "set to `True`."
+        },
+    )
+    num_completions_to_print: int | None = field(
+        default=None,
+        metadata={
+            "help": "Number of completions to print with `rich`. If `None`, all completions are logged."
+        },
+    )
+    wandb_entity: str | None = field(
+        default=None,
+        metadata={"help": ("The entity to store runs under.")},
+    )
+    wandb_project: str | None = field(
+        default=None,
+        metadata={"help": ("The project to store runs under.")},
+    )
+    wandb_run_group: str | None = field(
+        default=None,
+        metadata={"help": ("The group to store runs under.")},
+    )
+    wandb_log_unique_prompts: bool = field(
+        default=True,
+        metadata={
+            "help": (
+                "Whether to log the unique prompts to wandb. This will create a new run for each unique prompt."
+            )
+        },
+    )
+    callbacks: list[str] = field(
+        default_factory=lambda: [],
+        metadata={"help": "The callbacks to run during training."},
+    )
+    hub_model_revision: str | None = field(
+        default="main", metadata={"help": "The Hub model branch to push the model to."}
+    )
+    num_completions_to_print: int = field(
+        default=5, metadata={"help": "Number of completions to print."}
+    )
+    overwrite_hub_revision: bool = field(
+        default=False, metadata={"help": "Whether to overwrite the Hub revision."}
+    )
+    push_to_hub_revision: bool = field(
+        default=False, metadata={"help": "Whether to push to a Hub revision/branch."}
+    )
+    trl_project: str = field(
+        default="smollm3",
+        metadata={
+            "help": "The TRL project to use for evaluation. This is used to determine the path to the evaluation script."
+        },
+    )
+
+    def __post_init__(self):
+        super().__post_init__()
+        # check lmbda and beta are in the range [0, 1]
+        if self.lmbda < 0.0 or self.lmbda > 1.0:
+            raise ValueError("lmbda must be in the range [0.0, 1.0].")
+        if self.beta < 0.0 or self.beta > 1.0:
+            raise ValueError("beta must be in the range [0.0, 1.0].")
+
+        # Validate that max_length is sufficient for max_completion_length
+        if (
+            self.max_length is not None
+            and self.max_completion_length >= self.max_length
+        ):
+            raise ValueError(
+                f"max_completion_length ({self.max_completion_length}) must be smaller than max_length ({self.max_length}) "
+                f"to leave room for the prompt. Consider increasing max_length or reducing max_completion_length."
+            )
+
+        if self.steps_per_generation is None:
+            self.steps_per_generation = self.gradient_accumulation_steps
+
+        # Validate ULD parameters
+        if self.use_uld_loss:
+            if self.uld_crossentropy_weight < 0.0:
+                raise ValueError("uld_crossentropy_weight must be non-negative.")
+            if self.uld_distillation_weight < 0.0:
+                raise ValueError("uld_distillation_weight must be non-negative.")
+            if self.uld_student_temperature <= 0.0:
+                raise ValueError("uld_student_temperature must be positive.")
+            if self.uld_teacher_temperature <= 0.0:
+                raise ValueError("uld_teacher_temperature must be positive.")
+
+            # Validate hybrid loss weights - both must be None or both must be set
+            if self.uld_use_hybrid_loss:
+                if (self.uld_hybrid_matched_weight is None) != (
+                    self.uld_hybrid_unmatched_weight is None
+                ):
+                    raise ValueError(
+                        "uld_hybrid_matched_weight and uld_hybrid_unmatched_weight must both be None (for adaptive "
+                        "weighting) or both be set to numeric values. Got uld_hybrid_matched_weight="
+                        f"{self.uld_hybrid_matched_weight} and uld_hybrid_unmatched_weight="
+                        f"{self.uld_hybrid_unmatched_weight}."
+                    )
+                if self.uld_hybrid_matched_weight is not None:
+                    if self.uld_hybrid_matched_weight < 0.0:
+                        raise ValueError(
+                            "uld_hybrid_matched_weight must be non-negative."
+                        )
+                    if self.uld_hybrid_unmatched_weight < 0.0:
+                        raise ValueError(
+                            "uld_hybrid_unmatched_weight must be non-negative."
+                        )
diff --git a/src/aixpert/training/training/trl/experimental/gold/gold_trainer.py b/src/aixpert/training/training/trl/experimental/gold/gold_trainer.py
new file mode 100644
index 0000000..d84fb56
--- /dev/null
+++ b/src/aixpert/training/training/trl/experimental/gold/gold_trainer.py
@@ -0,0 +1,2451 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import random
+import textwrap
+import warnings
+from collections import defaultdict, deque
+from collections.abc import Callable
+from contextlib import nullcontext
+from typing import Any, Optional
+
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+from accelerate import PartialState
+from accelerate.utils import (
+    DistributedType,
+    broadcast_object_list,
+    gather_object,
+    is_peft_model,
+)
+from datasets import Dataset, IterableDataset
+from torch import nn
+from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+from transformers import AutoTokenizer
+from transformers.data.data_collator import DataCollator
+from transformers.feature_extraction_utils import FeatureExtractionMixin
+from transformers.generation.configuration_utils import GenerationConfig
+from transformers.image_processing_utils import BaseImageProcessor
+from transformers.integrations.integration_utils import is_wandb_available
+from transformers.modeling_utils import PreTrainedModel
+from transformers.processing_utils import ProcessorMixin
+from transformers.tokenization_utils_base import PreTrainedTokenizerBase
+from transformers.trainer_callback import TrainerCallback, TrainerControl, TrainerState
+from transformers.trainer_utils import EvalPrediction
+from transformers.utils import (
+    is_flash_attn_2_available,
+    is_liger_kernel_available,
+    is_peft_available,
+    is_rich_available,
+)
+
+from ...data_utils import (
+    is_conversational,
+    maybe_convert_to_chatml,
+    pack_dataset,
+    truncate_dataset,
+)
+from ...extras.profiling import profiling_decorator
+from ...extras.vllm_client import VLLMClient
+from ...import_utils import is_vllm_available
+from ...models import prepare_deepspeed
+from ...models.utils import unwrap_model_for_generation
+from ...trainer.sft_trainer import SFTTrainer
+from ...trainer.utils import (
+    DataCollatorForChatML,
+    create_model_from_path,
+    disable_dropout_in_model,
+    empty_cache,
+    ensure_master_addr_port,
+    pad,
+)
+from .gold_config import GOLDConfig
+
+
+if is_peft_available():
+    from peft import PeftConfig
+
+if is_wandb_available():
+    import wandb
+
+if is_vllm_available():
+    from vllm import LLM, SamplingParams
+    from vllm.sampling_params import GuidedDecodingParams
+
+if is_liger_kernel_available():
+    from liger_kernel.chunked_loss import LigerFusedLinearJSDLoss
+
+if is_rich_available():
+    from rich.console import Console
+    from rich.panel import Panel
+    from rich.table import Table
+    from rich.text import Text
+
+
+def print_prompt_completions_sample_uld(
+    prompts: list[str],
+    completions: list[str],
+    step: int,
+    num_samples: int = None,
+) -> None:
+    """
+    Print out a sample of model completions to the console with multiple reward metrics.
+
+    This function creates a nicely formatted table showing prompt-completion pairs, useful for monitoring model outputs
+    during training. It requires the `rich` library to be installed.
+
+    Args:
+        prompts (`list[str]`):
+            List of prompts.
+        completions (`list[str]`):
+            List of completions corresponding to the prompts.
+        rewards (`dict[str, list[float]]`):
+            Dictionary where keys are reward names and values are lists of rewards.
+        advantages (`list[float]`):
+            List of advantages corresponding to the prompts and completions.
+        step (`int`):
+            Current training step number, used in the output title.
+        num_samples (`int` or `None`, *optional*, defaults to `None`):
+            Number of random samples to display. If `None` (default), all items will be displayed.
+
+    Example:
+    ```python
+    >>> from trl.trainer.utils import print_prompt_completions_sample
+
+    >>> prompts = ["The sky is", "The sun is"]
+    >>> completions = [" blue.", " in the sky."]
+    >>> rewards = {"Correctness": [0.123, 0.456], "Format": [0.789, 0.101]}
+    >>> advantages = [0.987, 0.654]
+    >>> print_prompt_completions_sample(prompts, completions, rewards, advantages, 42)
+    ╭──────────────────────────── Step 42 ─────────────────────────────╮
+    │ ┏━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━┓ │
+    │ ┃ Prompt     ┃ Completion   ┃ Correctness ┃ Format ┃ Advantage ┃ │
+    │ ┡━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━┩ │
+    │ │ The sky is │  blue.       │        0.12 │   0.79 │      0.99 │ │
+    │ ├────────────┼──────────────┼─────────────┼────────┼───────────┤ │
+    │ │ The sun is │  in the sky. │        0.46 │   0.10 │      0.65 │ │
+    │ └────────────┴──────────────┴─────────────┴────────┴───────────┘ │
+    ╰──────────────────────────────────────────────────────────────────╯
+    ```
+    """
+    if not is_rich_available():
+        raise ImportError(
+            "The function `print_prompt_completions_sample` requires the `rich` library. Please install it with "
+            "`pip install rich`."
+        )
+    console = Console()
+    table = Table(show_header=True, header_style="bold white", expand=True)
+
+    # Add columns
+    table.add_column("Prompt", style="bright_yellow")
+    table.add_column("Completion", style="bright_green")
+
+    # Some basic input validation
+    if num_samples is not None:
+        if num_samples >= len(prompts):
+            num_samples = None
+        elif num_samples <= 0:
+            return
+
+    # Subsample data if num_samples is specified
+    if num_samples is not None:
+        indices = random.sample(range(len(prompts)), num_samples)
+        prompts = [prompts[i] for i in indices]
+        completions = [completions[i] for i in indices]
+
+    for i in range(len(prompts)):
+        table.add_row(Text(prompts[i]), Text(completions[i]))
+        table.add_section()  # Adds a separator between rows
+
+    panel = Panel(table, expand=False, title=f"Step {step}", border_style="bold white")
+    console.print(panel)
+
+
+def build_teacher_inputs_from_texts(
+    tokenizer: PreTrainedTokenizerBase,
+    prompt_texts: list[str],
+    completion_texts: list[str],
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, int]:
+    """Tokenize teacher prompts/completions and produce tensors ready for GOLD loss."""
+    pad_token_id = tokenizer.pad_token_id
+    eos_token_id = tokenizer.eos_token_id
+
+    prompt_token_ids = tokenizer(prompt_texts, add_special_tokens=True)["input_ids"]
+    completion_token_ids = tokenizer(completion_texts, add_special_tokens=False)[
+        "input_ids"
+    ]
+
+    sequences: list[torch.Tensor] = []
+    attention_masks: list[torch.Tensor] = []
+    labels_list: list[torch.Tensor] = []
+    prompt_lengths: list[int] = []
+
+    for prompt_ids, completion_ids in zip(
+        prompt_token_ids, completion_token_ids, strict=True
+    ):
+        # Remove trailing EOS from prompt so completions can extend cleanly
+        if eos_token_id is not None and prompt_ids and prompt_ids[-1] == eos_token_id:
+            prompt_ids = prompt_ids[:-1]
+
+        prompt_lengths.append(len(prompt_ids))
+        sequence = list(prompt_ids)
+        sequence.extend(completion_ids)
+        if eos_token_id is not None:
+            sequence.append(eos_token_id)
+
+        seq_tensor = torch.tensor(sequence, dtype=torch.long)
+        sequences.append(seq_tensor)
+        attention_masks.append(torch.ones_like(seq_tensor))
+
+        labels = seq_tensor.clone()
+        labels[: len(prompt_ids)] = -100
+        if pad_token_id is not None:
+            labels[labels == pad_token_id] = -100
+        labels_list.append(labels)
+
+    teacher_input_ids = pad(
+        sequences,
+        padding_side="right",
+        padding_value=pad_token_id if pad_token_id is not None else 0,
+    )
+    teacher_attention_mask = pad(
+        attention_masks, padding_side="right", padding_value=0
+    ).bool()
+    teacher_labels = pad(labels_list, padding_side="right", padding_value=-100)
+
+    if eos_token_id is not None:
+        for row in range(teacher_attention_mask.size(0)):
+            valid = (
+                teacher_input_ids[row] != pad_token_id
+                if pad_token_id is not None
+                else teacher_attention_mask[row].bool()
+            )
+            if valid.any():
+                last_idx = valid.nonzero(as_tuple=True)[0][-1]
+                teacher_attention_mask[row, last_idx + 1 :] = False
+
+    teacher_prompt_length = max(prompt_lengths) if prompt_lengths else 0
+
+    return (
+        teacher_input_ids,
+        teacher_labels,
+        teacher_attention_mask,
+        teacher_prompt_length,
+    )
+
+
+class ULDLoss(nn.Module):
+    """
+    Universal Logit Distillation Loss.
+    """
+
+    def __init__(
+        self, config: GOLDConfig, student_tokenizer=None, teacher_tokenizer=None
+    ):
+        super().__init__()
+        self.crossentropy_weight = config.uld_crossentropy_weight
+        self.distillation_weight = config.uld_distillation_weight
+        self.student_temperature = config.uld_student_temperature
+        self.teacher_temperature = config.uld_teacher_temperature
+        self.skip_student_eos = config.uld_skip_student_eos
+        self.skip_teacher_eos = config.uld_skip_teacher_eos
+        self.use_extended_uld = config.use_extended_uld
+        self.ignore_index = -100
+
+        # Add tokenizers for enhanced alignment
+        self.student_tokenizer = student_tokenizer
+        self.teacher_tokenizer = teacher_tokenizer
+
+        # Hybrid ULD configuration
+        self.use_hybrid_loss = getattr(config, "uld_use_hybrid_loss", False)
+        self.hybrid_matched_weight = getattr(config, "uld_hybrid_matched_weight", None)
+        self.hybrid_unmatched_weight = getattr(
+            config, "uld_hybrid_unmatched_weight", None
+        )
+        self.beta = getattr(
+            config, "beta", 1.0
+        )  # For JSD loss in hybrid matched tokens
+
+        # Initialize vocabulary mapping for hybrid loss
+        self._vocab_mapping = None
+        self._teacher_matched_ids = None
+        self._student_matched_ids = None
+        if (
+            self.use_hybrid_loss
+            and student_tokenizer is not None
+            and teacher_tokenizer is not None
+        ):
+            self._initialize_vocabulary_mapping()
+
+    def __call__(
+        self,
+        student_logits,
+        teacher_logits,
+        student_labels,
+        teacher_labels,
+        student_input_ids,
+        teacher_input_ids,
+    ):
+        """
+        Compute ULD loss with GKD trainer interface.
+
+        Args:
+            student_logits: Student model logits [batch_size, seq_len, vocab_size]
+            teacher_logits: Teacher model logits [batch_size, seq_len, vocab_size]
+            student_labels: Student target labels [batch_size, seq_len]
+            teacher_labels: Teacher target labels [batch_size, seq_len]
+            student_input_ids: Student input token IDs [batch_size, seq_len]
+            teacher_input_ids: Teacher input token IDs [batch_size, seq_len]
+
+        Returns
+        -------
+            Total loss (cross-entropy + distillation)
+        """
+        # Compute cross-entropy loss for student
+        if self.crossentropy_weight > 0:
+            shift_logits = student_logits[..., :-1, :].contiguous()
+            shift_labels = student_labels[..., 1:].contiguous()
+            loss_fct = nn.CrossEntropyLoss(ignore_index=self.ignore_index)
+            crossentropy_loss = loss_fct(
+                shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
+            )
+            crossentropy_loss = self.crossentropy_weight * crossentropy_loss
+        else:
+            crossentropy_loss = 0.0
+
+        # Compute distillation loss using ULD approximation
+        distillation_loss = self._compute_distillation_loss(
+            student_logits,
+            teacher_logits,
+            student_labels,
+            teacher_labels,
+            student_input_ids,
+            teacher_input_ids,
+        )
+
+        return crossentropy_loss + distillation_loss
+
+    def _initialize_vocabulary_mapping(self):
+        """Initialize vocabulary mapping for hybrid ULD loss."""
+        # Computing vocabulary mapping for hybrid ULD
+
+        student_vocab = self.student_tokenizer.get_vocab()
+        teacher_vocab = self.teacher_tokenizer.get_vocab()
+
+        # Create reverse mapping for student
+        student_token_to_id = dict(student_vocab.items())
+
+        vocab_mapping = {}
+        teacher_matched_ids = set()
+        student_matched_ids = set()
+
+        for token_str, teacher_id in teacher_vocab.items():
+            if token_str in student_token_to_id:
+                student_id = student_token_to_id[token_str]
+                vocab_mapping[teacher_id] = student_id
+                teacher_matched_ids.add(teacher_id)
+                student_matched_ids.add(student_id)
+
+        self._vocab_mapping = vocab_mapping
+        self._teacher_matched_ids = teacher_matched_ids
+        self._student_matched_ids = student_matched_ids
+
+    def _compute_distillation_loss(
+        self,
+        student_logits,
+        teacher_logits,
+        student_labels,
+        teacher_labels,
+        student_input_ids,
+        teacher_input_ids,
+    ):
+        """
+        Compute the Universal Logit Distillation loss with token mapping.
+
+        This version uses actual input_ids for accurate token mapping and multiplies probabilities for split tokens.
+        Both student_input_ids and teacher_input_ids are required for optimal alignment.
+        """
+        # Get answer regions (same as original)
+        student_answer_index, student_answer_size = self._get_start_and_size_answers(
+            student_labels
+        )
+        teacher_answer_index, teacher_answer_size = self._get_start_and_size_answers(
+            teacher_labels
+        )
+
+        if self.skip_student_eos:
+            student_answer_size = [size - 1 for size in student_answer_size]
+        if self.skip_teacher_eos:
+            teacher_answer_size = [size - 1 for size in teacher_answer_size]
+
+        # Handle edge case where all answer sizes are 0
+        if (
+            not student_answer_size
+            or not teacher_answer_size
+            or max(max(student_answer_size), max(teacher_answer_size)) <= 0
+        ):
+            return (
+                torch.zeros(1, device=student_logits.device, requires_grad=True)
+                * student_logits.sum()
+                * 1e-8
+            )
+
+        batch_size = student_logits.size(0)
+        distillation_losses = []
+
+        for i in range(batch_size):
+            # Get answer regions for this batch item
+            student_start = student_answer_index[i]
+            student_size = student_answer_size[i]
+            teacher_start = teacher_answer_index[i]
+            teacher_size = teacher_answer_size[i]
+
+            if student_size <= 0 or teacher_size <= 0:
+                loss_i = student_logits[i].sum() * 0.0
+                distillation_losses.append(loss_i)
+                continue
+
+            # Extract answer logits
+            student_answer_logits = student_logits[
+                i, student_start : student_start + student_size
+            ]
+            teacher_answer_logits = teacher_logits[
+                i, teacher_start : teacher_start + teacher_size
+            ]
+
+            # Convert to probabilities
+            student_probs = F.softmax(
+                student_answer_logits / self.student_temperature, dim=-1
+            )
+            teacher_probs = F.softmax(
+                teacher_answer_logits / self.teacher_temperature, dim=-1
+            )
+
+            # Get token IDs for mapping (always use actual input_ids)
+            student_token_ids = student_input_ids[
+                i, student_start : student_start + student_size
+            ].tolist()
+            teacher_token_ids = teacher_input_ids[
+                i, teacher_start : teacher_start + teacher_size
+            ].tolist()
+
+            if self.use_extended_uld:
+                # Build alignment groups directly from token ids using greedy text matching
+                student_alignment_groups, teacher_alignment_groups = (
+                    self._build_alignment_groups_from_ids(
+                        student_token_ids, teacher_token_ids
+                    )
+                )
+
+                # Merge student probabilities using student alignment groups
+                student_aligned = self._merge_probabilities_with_alignment_groups(
+                    student_probs, student_alignment_groups
+                )
+
+                # Merge teacher probabilities using teacher alignment groups
+                teacher_aligned = self._merge_probabilities_with_alignment_groups(
+                    teacher_probs, teacher_alignment_groups
+                )
+            else:
+                min_length = min(len(student_token_ids), len(teacher_token_ids))
+                student_aligned = student_probs[:min_length, :]
+                teacher_aligned = teacher_probs[:min_length, :]
+
+            # Apply ULD loss computation
+            if self.use_hybrid_loss and self._vocab_mapping is not None:
+                # Use hybrid approach: direct comparison for matched tokens, sorting for unmatched
+                aligned_loss = self._compute_hybrid_uld_loss(
+                    student_aligned, teacher_aligned
+                )
+            else:
+                # Original approach: sort all probabilities
+                student_sorted = student_aligned.sort(dim=-1, descending=True).values
+                teacher_sorted = teacher_aligned.sort(dim=-1, descending=True).values
+
+                # Pad vocabularies to same size
+                student_vocab_size = student_sorted.size(-1)
+                teacher_vocab_size = teacher_sorted.size(-1)
+                max_vocab_size = max(student_vocab_size, teacher_vocab_size)
+
+                if student_vocab_size < max_vocab_size:
+                    student_sorted = F.pad(
+                        student_sorted, (0, max_vocab_size - student_vocab_size)
+                    )
+                if teacher_vocab_size < max_vocab_size:
+                    teacher_sorted = F.pad(
+                        teacher_sorted, (0, max_vocab_size - teacher_vocab_size)
+                    )
+
+                # Compute L1 distance (ULD approach)
+                aligned_loss = F.l1_loss(
+                    student_sorted, teacher_sorted, reduction="sum"
+                )
+                aligned_loss /= student_aligned.size(0)  # Normalize by sequence length
+            distillation_losses.append(aligned_loss)
+
+        distillation_loss = torch.stack(distillation_losses).mean()
+        return self.distillation_weight * distillation_loss
+
+    def _build_alignment_groups_from_ids(self, student_token_ids, teacher_token_ids):
+        """
+        Build alignment groups using a greedy substring-equality algorithm on decoded token pieces.
+
+        Args:
+            student_token_ids: List[int]
+            teacher_token_ids: List[int]
+
+        Returns
+        -------
+            Tuple[List[List[int]], List[List[int]]]: student and teacher alignment groups
+        """
+
+        def to_canonical_pieces(tok, ids):
+            pieces = []
+            prev = ""
+            for k in range(len(ids)):
+                # IMPORTANT: Do NOT skip special tokens - we need to align them too
+                cur = tok.decode(
+                    ids[: k + 1],
+                    skip_special_tokens=False,
+                    clean_up_tokenization_spaces=False,
+                )
+                # Extract the incremental addition (may include spaces/ZWJ/etc.)
+                pieces.append(cur[len(prev) :])
+                prev = cur
+            return pieces
+
+        s_pieces = to_canonical_pieces(self.student_tokenizer, student_token_ids)
+        t_pieces = to_canonical_pieces(self.teacher_tokenizer, teacher_token_ids)
+
+        i = j = 0
+        s_buf = t_buf = ""
+        s_group = []
+        t_group = []
+        s_groups = []
+        t_groups = []
+
+        def flush():
+            if s_group and t_group:
+                s_groups.append(s_group.copy())
+                t_groups.append(t_group.copy())
+
+        # Greedily accumulate pieces until substrings match, then flush
+        while i < len(s_pieces) or j < len(t_pieces):
+            if s_buf == t_buf and s_buf != "":
+                flush()
+                s_buf = t_buf = ""
+                s_group = []
+                t_group = []
+                continue
+
+            if s_buf == "" and i < len(s_pieces):
+                s_buf += s_pieces[i]
+                s_group.append(i)
+                i += 1
+                continue
+            if t_buf == "" and j < len(t_pieces):
+                t_buf += t_pieces[j]
+                t_group.append(j)
+                j += 1
+                continue
+
+            if len(s_buf) <= len(t_buf):
+                if i < len(s_pieces):
+                    s_buf += s_pieces[i]
+                    s_group.append(i)
+                    i += 1
+                elif j < len(t_pieces):
+                    t_buf += t_pieces[j]
+                    t_group.append(j)
+                    j += 1
+            elif j < len(t_pieces):
+                t_buf += t_pieces[j]
+                t_group.append(j)
+                j += 1
+            elif i < len(s_pieces):
+                s_buf += s_pieces[i]
+                s_group.append(i)
+                i += 1
+
+        # Flush any remainder if both sides accumulated something
+        if s_buf == t_buf and s_group and t_group:
+            flush()
+        elif s_group or t_group:
+            # Handle remaining unmatched tokens by forcing a flush
+            # This ensures both sides have the same number of alignment groups
+            if s_group or t_group:
+                # Ensure both groups have content (even if empty list)
+                if not s_group:
+                    s_group = []
+                if not t_group:
+                    t_group = []
+                # Force flush even if buffers don't match
+                if s_group or t_group:
+                    s_groups.append(s_group.copy() if s_group else [])
+                    t_groups.append(t_group.copy() if t_group else [])
+
+        return s_groups, t_groups
+
+    def _merge_probabilities_with_alignment_groups(self, probs, alignment_groups):
+        """
+        Merge probabilities based on alignment groups.
+
+        Args:
+            probs: Probability tensor [seq_len, vocab_size]
+            alignment_groups: List of alignment groups (each group is a list of positions to merge)
+
+        Returns
+        -------
+            Merged probability tensor [num_groups, vocab_size]
+        """
+        if not alignment_groups:
+            return probs
+
+        # Create aligned tensor
+        vocab_size = probs.size(-1)
+        target_len = len(alignment_groups)
+        aligned_probs = torch.zeros(target_len, vocab_size, device=probs.device)
+
+        # Process each alignment group
+        for group_idx, group in enumerate(alignment_groups):
+            # Handle probability merging
+            if len(group) > 1:
+                # Multiple tokens map to this group - merge them
+                eps = 1e-8
+                logp = torch.log(probs[group[0]].clamp_min(eps))
+                for idx in group[1:]:
+                    if idx < probs.size(0):
+                        logp = logp + torch.log(probs[idx].clamp_min(eps))
+                aligned_probs[group_idx] = torch.softmax(logp, dim=-1)
+            elif len(group) == 1:
+                aligned_probs[group_idx] = probs[group[0]]
+            else:
+                # No tokens map to this group
+                aligned_probs[group_idx] = torch.zeros_like(probs[0])
+
+        return aligned_probs
+
+    def _compute_hybrid_uld_loss(self, student_aligned, teacher_aligned):
+        """
+        Compute hybrid ULD loss on aligned probability distributions. This method:
+        1. Directly compares probabilities for tokens with matching vocabulary entries
+        2. Uses sorting approach only for tokens with different vocabulary entries
+
+        Args:
+            student_aligned: Aligned student probabilities [seq_len, student_vocab_size]
+            teacher_aligned: Aligned teacher probabilities [seq_len, teacher_vocab_size]
+
+        Returns
+        -------
+            Combined hybrid loss
+        """
+        device = student_aligned.device
+        # seq_len = student_aligned.size(0)  # Unused variable
+        student_vocab_size = student_aligned.size(-1)
+        teacher_vocab_size = teacher_aligned.size(-1)
+
+        # Convert sets to sorted tensors for indexing
+        if self._teacher_matched_ids:
+            teacher_matched_indices = torch.tensor(
+                sorted(self._teacher_matched_ids), dtype=torch.long, device=device
+            )
+            student_matched_indices = torch.tensor(
+                [self._vocab_mapping[tid.item()] for tid in teacher_matched_indices],
+                dtype=torch.long,
+                device=device,
+            )
+        else:
+            teacher_matched_indices = torch.tensor([], dtype=torch.long, device=device)
+            student_matched_indices = torch.tensor([], dtype=torch.long, device=device)
+
+        # Create masks for unmatched tokens
+        teacher_matched_mask = torch.zeros(
+            teacher_vocab_size, dtype=torch.bool, device=device
+        )
+        student_matched_mask = torch.zeros(
+            student_vocab_size, dtype=torch.bool, device=device
+        )
+
+        if len(teacher_matched_indices) > 0:
+            teacher_matched_mask[teacher_matched_indices] = True
+            student_matched_mask[student_matched_indices] = True
+
+        # 1. JSD loss for matched vocabulary tokens (direct semantic correspondence)
+        matched_loss = torch.tensor(0.0, device=device)
+        matched_token_count = 0
+        if len(teacher_matched_indices) > 0:
+            # Extract probabilities for matched tokens
+            teacher_matched_probs = teacher_aligned[
+                :, teacher_matched_indices
+            ]  # [seq_len, num_matched]
+            student_matched_probs = student_aligned[
+                :, student_matched_indices
+            ]  # [seq_len, num_matched]
+            matched_token_count = teacher_matched_probs.size(-1)
+
+            # Use JSD loss for semantically aligned tokens
+            # Convert probabilities back to logits for JSD computation
+
+            # Apply generalized JSD loss to matched tokens
+            matched_loss = self._compute_jsd_loss_for_matched_tokens(
+                student_matched_probs, teacher_matched_probs
+            )
+
+        # 2. Sorted comparison loss for unmatched vocabulary tokens
+        teacher_unmatched_mask = ~teacher_matched_mask
+        student_unmatched_mask = ~student_matched_mask
+
+        teacher_unmatched_probs = teacher_aligned[
+            :, teacher_unmatched_mask
+        ]  # [seq_len, num_teacher_unmatched]
+        student_unmatched_probs = student_aligned[
+            :, student_unmatched_mask
+        ]  # [seq_len, num_student_unmatched]
+
+        unmatched_loss = torch.tensor(0.0, device=device)
+        if (
+            teacher_unmatched_probs.size(-1) > 0
+            and student_unmatched_probs.size(-1) > 0
+        ):
+            # Sort unmatched probabilities
+            teacher_unmatched_sorted = teacher_unmatched_probs.sort(
+                dim=-1, descending=True
+            ).values
+            student_unmatched_sorted = student_unmatched_probs.sort(
+                dim=-1, descending=True
+            ).values
+
+            # Pad to same size if needed
+            teacher_unmatched_size = teacher_unmatched_sorted.size(-1)
+            student_unmatched_size = student_unmatched_sorted.size(-1)
+            max_unmatched_size = max(teacher_unmatched_size, student_unmatched_size)
+
+            if teacher_unmatched_size < max_unmatched_size:
+                teacher_unmatched_sorted = F.pad(
+                    teacher_unmatched_sorted,
+                    (0, max_unmatched_size - teacher_unmatched_size),
+                )
+            if student_unmatched_size < max_unmatched_size:
+                student_unmatched_sorted = F.pad(
+                    student_unmatched_sorted,
+                    (0, max_unmatched_size - student_unmatched_size),
+                )
+
+            # L1 loss on sorted unmatched tokens
+            unmatched_loss = F.l1_loss(
+                student_unmatched_sorted, teacher_unmatched_sorted, reduction="sum"
+            )
+            unmatched_loss /= student_aligned.size(0)  # Normalize by sequence length
+
+        # 3. Combine losses with weights
+        if self.hybrid_matched_weight is None:
+            # Use adaptive weighting based on vocabulary overlap
+            hybrid_matched_weight = matched_token_count / max(1, teacher_vocab_size)
+            hybrid_unmatched_weight = 1.0 - hybrid_matched_weight
+        else:
+            # Use fixed weights provided in config
+            hybrid_matched_weight = self.hybrid_matched_weight
+            hybrid_unmatched_weight = self.hybrid_unmatched_weight
+
+        total_loss = (
+            hybrid_matched_weight * matched_loss
+            + hybrid_unmatched_weight * unmatched_loss
+        )
+
+        # Store matched/unmatched components for logging
+        self.last_matched_loss = matched_loss
+        self.last_unmatched_loss = unmatched_loss
+
+        return total_loss
+
+    def _compute_jsd_loss_for_matched_tokens(self, student_logits, teacher_logits):
+        """
+        Compute JSD loss for matched vocabulary tokens.
+
+        Args:
+            student_logits: Student logits for matched tokens [seq_len, num_matched]
+            teacher_logits: Teacher logits for matched tokens [seq_len, num_matched]
+
+        Returns
+        -------
+            JSD loss for matched tokens
+        """
+        # Reshape to [batch_size * seq_len, vocab_size] format expected by generalized_jsd_loss
+        batch_seq_len, num_matched = student_logits.shape
+
+        student_logits_reshaped = student_logits.view(-1, num_matched)
+        teacher_logits_reshaped = teacher_logits.view(-1, num_matched)
+
+        # Use the GOLD generalized JSD loss implementation that accepts probability inputs
+        jsd_loss = GOLDTrainer.generalized_jsd_loss(
+            student_logits_reshaped,
+            teacher_logits_reshaped,
+            labels=None,  # No masking needed for matched tokens
+            beta=self.beta,  # Standard JSD beta
+            temperature=1.0,  # Already applied in main computation
+            reduction="batchmean",
+            logits_are_probs=True,
+        )
+
+        return jsd_loss
+
+    def _get_start_and_size_answers(self, answer_tensors):
+        answers_index = []
+        answers_size = []
+
+        for answer in answer_tensors:
+            answer_mask = answer.ne(self.ignore_index)
+            if not answer_mask.any():
+                answers_index.append(0)
+                answers_size.append(0)
+                continue
+
+            valid_indices = answer_mask.nonzero(as_tuple=True)[0]
+            answers_index.append(int(valid_indices[0].item()))
+            answers_size.append(int(answer_mask.sum().item()))
+        return answers_index, answers_size
+
+
+class GOLDVLLMSyncCallback(TrainerCallback):
+    """Sync the model weights to vLLM after training steps when it's safe to do so."""
+
+    def __init__(self, trainer):
+        self.trainer = trainer
+
+    def on_step_end(self, args, state: TrainerState, control: TrainerControl, **kwargs):
+        """Sync weights after training step when DeepSpeed is stable."""
+        if (
+            self.trainer.use_vllm
+            and state.global_step != self.trainer._last_vllm_sync_step
+            and state.global_step % self.trainer.vllm_sync_frequency == 0
+        ):
+            # Check if this is a step where gradients are synchronized
+            # This happens at the end of gradient accumulation cycles
+            if (
+                hasattr(self.trainer.accelerator, "sync_gradients")
+                and self.trainer.accelerator.sync_gradients
+            ):
+                self.trainer._move_model_to_vllm()
+                self.trainer._last_vllm_sync_step = state.global_step
+
+
+class GOLDTrainer(SFTTrainer):
+    _tag_names = ["trl", "gold"]
+    _name = "GOLD"
+    _paper = {
+        "title": "Unlocking On-Policy Distillation for Any Model Family",
+        # docstyle-ignore
+        "citation": textwrap.dedent("""\
+            @misc{patino2025unlocking,
+                title        = {{Unlocking On-Policy Distillation for Any Model Family}},
+                author       = {Carlos Miguel Patiño and Kashif Rasul and Quentin Gallouédec and Ben Burtenshaw and Sergio Paniego and Vaibhav Srivastav and Thibaud Frere and Ed Beeching and Lewis Tunstall and Leandro von Werra and Thomas Wolf},
+                year         = 2025,
+                url          = {https://huggingface.co/spaces/HuggingFaceH4/general-on-policy-logit-distillation},
+            }"""),
+    }
+
+    def __init__(
+        self,
+        model: PreTrainedModel | nn.Module | str | None = None,
+        teacher_model: PreTrainedModel | nn.Module | str = None,
+        args: GOLDConfig | None = None,
+        data_collator: DataCollator | None = None,  # type: ignore
+        train_dataset: Dataset | None = None,
+        eval_dataset: Dataset | dict[str, Dataset] | None = None,
+        processing_class: PreTrainedTokenizerBase
+        | BaseImageProcessor
+        | FeatureExtractionMixin
+        | ProcessorMixin
+        | None = None,
+        compute_metrics: Callable[[EvalPrediction], dict] | None = None,
+        callbacks: list[TrainerCallback] | None = None,
+        optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (
+            None,
+            None,
+        ),
+        preprocess_logits_for_metrics: Callable[
+            [torch.Tensor, torch.Tensor], torch.Tensor
+        ]
+        | None = None,
+        peft_config: Optional["PeftConfig"] = None,
+    ):
+        self.model_name_or_path = (
+            model if isinstance(model, str) else model.config._name_or_path
+        )
+        self.model_revision = getattr(args, "student_model_revision", None)
+        if isinstance(model, str) and self.model_revision is not None:
+            args.model_init_kwargs = args.model_init_kwargs or {}
+            args.model_init_kwargs.setdefault("revision", self.model_revision)
+
+        # Respect a user-provided data_collator; otherwise, provide a ChatML collator that
+        if data_collator is None:
+            data_collator = DataCollatorForChatML(
+                tokenizer=processing_class, max_length=args.max_length
+            )
+
+        # Liger fused GKD loss (JSD)
+        self.use_liger_gkd_loss = False
+        if args.use_liger_kernel:
+            self.liger_jsd_loss = LigerFusedLinearJSDLoss(
+                beta=args.beta,
+                ignore_index=-100,
+                temperature=args.temperature,
+                compiled=False,
+            )
+            self.use_liger_gkd_loss = True
+
+        if args.teacher_model_init_kwargs is None:
+            teacher_model_init_kwargs = {}
+        elif not isinstance(teacher_model, str):
+            raise ValueError(
+                "You passed teacher_model_init_kwargs to the GOLDConfig, but your teacher_model is already instantiated."
+            )
+        else:
+            teacher_model_init_kwargs = args.teacher_model_init_kwargs
+            teacher_model_init_kwargs["torch_dtype"] = (
+                teacher_model_init_kwargs["torch_dtype"]
+                if teacher_model_init_kwargs["torch_dtype"] in ["auto", None]
+                else getattr(torch, teacher_model_init_kwargs["torch_dtype"])
+            )
+
+        if args.use_uld_loss and args.teacher_tokenizer_name_or_path is None:
+            if isinstance(teacher_model, str):
+                args.teacher_tokenizer_name_or_path = teacher_model
+            else:
+                raise ValueError(
+                    "`teacher_tokenizer_name_or_path` must be set when using ULD loss with a pre-instantiated teacher model."
+                )
+
+        if isinstance(teacher_model, str):
+            init_kwargs = dict(teacher_model_init_kwargs)
+            if "torch_dtype" in init_kwargs and "dtype" not in init_kwargs:
+                init_kwargs["dtype"] = init_kwargs.pop("torch_dtype")
+            teacher_model = create_model_from_path(teacher_model, **init_kwargs)
+        self.use_uld_loss = args.use_uld_loss
+        self.teacher_tokenizer = None
+        if args.use_uld_loss and args.teacher_tokenizer_name_or_path is not None:
+            self.teacher_tokenizer = AutoTokenizer.from_pretrained(
+                args.teacher_tokenizer_name_or_path
+            )
+            if (
+                not hasattr(self.teacher_tokenizer, "pad_token")
+                or self.teacher_tokenizer.pad_token is None
+            ):
+                self.teacher_tokenizer.pad_token = self.teacher_tokenizer.eos_token
+
+        # Hybrid ULD loss configuration is handled in ULDLoss class
+
+        super().__init__(
+            model,
+            args=args,
+            data_collator=data_collator,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            processing_class=processing_class,
+            compute_metrics=compute_metrics,
+            callbacks=callbacks,
+            optimizers=optimizers,
+            preprocess_logits_for_metrics=preprocess_logits_for_metrics,
+            peft_config=peft_config,
+        )
+
+        if args.disable_dropout:
+            disable_dropout_in_model(self.model)
+        if not args.use_uld_loss:
+            teacher_model.resize_token_embeddings(self.model.config.vocab_size)
+
+        if self.is_deepspeed_enabled:
+            self.teacher_model = prepare_deepspeed(teacher_model, self.accelerator)
+        else:
+            self.teacher_model = self.accelerator.prepare_model(
+                teacher_model, evaluation_mode=True
+            )
+
+        self.lmbda = args.lmbda
+        self.beta = args.beta
+        self.temperature = args.temperature
+        self.top_p = args.top_p
+        self.seq_kd = args.seq_kd
+
+        # Track per-step loss statistics for on/off-policy batches (used in logging)
+        self._on_policy_loss_total = 0.0
+        self._off_policy_loss_total = 0.0
+        self._on_policy_step_equiv = 0.0
+        self._off_policy_step_equiv = 0.0
+
+        # Hybrid ULD matched/unmatched accumulators (logged every step when ULD hybrid is used)
+        self._matched_sum = 0.0
+        self._unmatched_sum = 0.0
+        self._matched_step_eq = 0.0
+        self._unmatched_step_eq = 0.0
+
+        self.use_transformers_paged = args.use_transformers_paged or False
+
+        self.uld_loss_fn = None
+        if self.use_uld_loss:
+            self.uld_loss_fn = ULDLoss(
+                config=args,
+                student_tokenizer=processing_class,
+                teacher_tokenizer=self.teacher_tokenizer,
+            )
+
+        self.generation_config = GenerationConfig(
+            max_new_tokens=args.max_completion_length,
+            temperature=args.temperature,
+            top_p=args.top_p,
+            do_sample=True,
+            top_k=args.top_k,
+            pad_token_id=self.processing_class.pad_token_id,
+        )
+        if (
+            hasattr(self.model.generation_config, "eos_token_id")
+            and self.model.generation_config.eos_token_id is not None
+        ):
+            self.generation_config.eos_token_id = (
+                self.model.generation_config.eos_token_id
+            )
+
+        # Initialize the metrics
+        self._metrics = {"train": defaultdict(list), "eval": defaultdict(list)}
+        self._total_train_tokens = 0
+        self.log_completions = args.log_completions
+        self.log_completion_steps = args.log_completions_steps
+        self.wandb_log_unique_prompts = args.wandb_log_unique_prompts
+        self.num_completions_to_print = args.num_completions_to_print
+        # maxlen is set to the total number of forward passes per step. This value of `maxlen` ensures we log only the
+        # final optimization step.
+        maxlen = (
+            self.accelerator.num_processes
+            * args.per_device_train_batch_size
+            * args.steps_per_generation
+        )
+        self._textual_logs = {
+            "prompt": deque(maxlen=maxlen),
+            "completion": deque(maxlen=maxlen),
+            "rewards": defaultdict(lambda: deque(maxlen=maxlen)),
+            "advantages": deque(maxlen=maxlen),
+        }
+
+        self.use_vllm = args.use_vllm
+        if self.use_vllm:
+            if not is_vllm_available():
+                raise ImportError(
+                    "vLLM is not available and use_vllm is set to True. Please install vLLM with "
+                    "`pip install vllm` to use it."
+                )
+            self.vllm_mode = args.vllm_mode
+            self.vllm_tensor_parallel_size = args.vllm_tensor_parallel_size
+            self.vllm_gpu_memory_utilization = args.vllm_gpu_memory_utilization
+            self.vllm_enable_sleep_mode = args.vllm_enable_sleep_mode
+            if self.vllm_mode == "server":
+                if self.accelerator.is_main_process:
+                    self.vllm_client = VLLMClient(
+                        host=args.vllm_server_host,
+                        server_port=args.vllm_server_port,
+                        connection_timeout=args.vllm_server_timeout,
+                    )
+                    self.vllm_client.init_communicator()
+            elif self.vllm_mode == "colocate":
+                student_model_name_or_path = self.model_name_or_path
+
+                # Make sure tensor_parallel_size divides world size evenly
+                if (
+                    not self.accelerator.num_processes % self.vllm_tensor_parallel_size
+                    == 0
+                ):
+                    raise ValueError(
+                        f"vllm_tensor_parallel_size ({self.vllm_tensor_parallel_size}) must divide world size "
+                        f"({self.accelerator.num_processes}) evenly."
+                    )
+
+                if self.vllm_tensor_parallel_size > 1:
+                    # Create subgroups of ranks for TP
+                    self.vllm_tp_group, _ = (
+                        torch.distributed.new_subgroups_by_enumeration(
+                            [
+                                list(
+                                    range(
+                                        i * self.vllm_tensor_parallel_size,
+                                        (i + 1) * self.vllm_tensor_parallel_size,
+                                    )
+                                )
+                                for i in range(
+                                    self.accelerator.num_processes
+                                    // self.vllm_tensor_parallel_size
+                                )
+                            ]
+                        )
+                    )
+
+                # vLLM requires the environment variables to be set for distributed training.
+                os.environ["RANK"] = str(self.accelerator.process_index)
+                os.environ["LOCAL_RANK"] = str(self.accelerator.local_process_index)
+                os.environ["WORLD_SIZE"] = str(self.accelerator.num_processes)
+                ensure_master_addr_port()
+
+                self.vllm_engine = LLM(
+                    model=student_model_name_or_path,
+                    revision=self.model_revision,
+                    tensor_parallel_size=self.vllm_tensor_parallel_size,
+                    gpu_memory_utilization=self.vllm_gpu_memory_utilization,
+                    max_num_seqs=self.args.per_device_train_batch_size
+                    * self.args.gradient_accumulation_steps,
+                    max_model_len=args.max_length,
+                    distributed_executor_backend="external_launcher",
+                    # Feed identical seed for tp groups to ensure sampling results are the same across workers
+                    seed=self.accelerator.process_index
+                    // self.vllm_tensor_parallel_size,
+                    enable_sleep_mode=self.vllm_enable_sleep_mode,
+                )
+
+                if self.vllm_enable_sleep_mode:
+                    self.vllm_engine.sleep(level=2)
+
+                # When using vLLM, the main process is responsible for loading the model weights. This can cause process
+                # desynchronization and seems to lead to DeepSpeed hanging during initialization. To prevent this, we
+                # synchronize all processes after vLLM has been fully initialized.
+                self.accelerator.wait_for_everyone()
+            else:
+                raise ValueError(f"Unknown vllm_mode: {self.vllm_mode}")
+            self.vllm_guided_decoding_regex = args.vllm_guided_decoding_regex
+            self.vllm_sync_frequency = args.vllm_sync_frequency
+            self._last_vllm_sync_step = -1
+
+            self.add_callback(GOLDVLLMSyncCallback(self))
+
+    def _set_signature_columns_if_needed(self):
+        super()._set_signature_columns_if_needed()
+        required_columns = [
+            "prompts",
+            "prompt_attention_mask",
+            "messages",
+            "chat_template_kwargs",
+            "tools",
+            "original_prompt_text",
+            "original_completion_text",
+        ]
+        if self._signature_columns is None:
+            self._signature_columns = required_columns
+        else:
+            for column in required_columns:
+                if column not in self._signature_columns:
+                    self._signature_columns.append(column)
+
+    def _prepare_dataset(
+        self,
+        dataset: Dataset | IterableDataset,
+        processing_class: PreTrainedTokenizerBase
+        | BaseImageProcessor
+        | FeatureExtractionMixin
+        | ProcessorMixin,
+        args,
+        packing: bool,
+        formatting_func: Callable[[dict], str] | None,
+        dataset_name: str,
+    ) -> Dataset | IterableDataset:
+        """
+        Override dataset preparation to preserve original text for cross-tokenizer distillation and ensure
+        attention_mask is always added for DataCollatorForChatML compatibility.
+        """
+        # Check if dataset is already processed
+        column_names = list(next(iter(dataset)).keys())
+        is_processed = "input_ids" in column_names
+
+        # Use our enhanced dataset preparation for:
+        # 1. ULD loss with cross-tokenizer (need original text preservation)
+        # 2. Any unprocessed dataset (need attention_mask for DataCollatorForChatML)
+        if not is_processed or (
+            self.use_uld_loss and self.teacher_tokenizer is not None
+        ):
+            # For unprocessed datasets, use our enhanced tokenization
+            return self._prepare_dataset_with_original_text(
+                dataset, processing_class, args, packing, formatting_func, dataset_name
+            )
+
+        # Use parent implementation for all other cases
+        return super()._prepare_dataset(
+            dataset, processing_class, args, packing, formatting_func, dataset_name
+        )
+
+    def _prepare_dataset_with_original_text(
+        self,
+        dataset: Dataset | IterableDataset,
+        processing_class: PreTrainedTokenizerBase
+        | BaseImageProcessor
+        | FeatureExtractionMixin
+        | ProcessorMixin,
+        args,
+        packing: bool,
+        formatting_func: Callable[[dict], str] | None,
+        dataset_name: str,
+    ) -> Dataset | IterableDataset:
+        """
+        Prepare dataset while preserving original text for cross-tokenizer distillation.
+        """
+        # Build the kwargs for the `map` function
+        map_kwargs = {}
+        if isinstance(dataset, Dataset):  # IterableDataset does not support num_proc
+            map_kwargs["num_proc"] = args.dataset_num_proc
+
+        with PartialState().main_process_first():
+            # Apply the formatting function if any
+            if formatting_func is not None:
+                if isinstance(
+                    dataset, Dataset
+                ):  # `IterableDataset.map` does not support `desc`
+                    map_kwargs["desc"] = (
+                        f"Applying formatting function to {dataset_name} dataset"
+                    )
+
+                def _func(example):
+                    return {"text": formatting_func(example)}
+
+                try:
+                    dataset = dataset.map(_func, batched=False, **map_kwargs)
+                except Exception as e:
+                    warnings.warn(
+                        f"Failed to apply the formatting function due to the following error: {e}. This may be "
+                        "because the function is designed for batched input. Please update it to process one example "
+                        "at a time (i.e., accept and return a single example). For now, we will attempt to apply the "
+                        "function in batched mode, but note that batched formatting is deprecated and will be removed "
+                        "in version 0.21.",
+                        DeprecationWarning,
+                    )
+                    dataset = dataset.map(_func, batched=True, **map_kwargs)
+
+            # Convert the dataset to ChatML if needed
+            if isinstance(
+                dataset, Dataset
+            ):  # `IterableDataset.map` does not support `desc`
+                map_kwargs["desc"] = f"Converting {dataset_name} dataset to ChatML"
+            column_names = next(iter(dataset)).keys()
+            dataset = dataset.map(
+                maybe_convert_to_chatml,
+                remove_columns="conversations"
+                if "conversations" in column_names
+                else None,
+                **map_kwargs,
+            )
+
+            # Apply the chat template if needed and preserve original text
+            first_example = next(iter(dataset))
+            if not is_conversational(first_example):
+                if isinstance(
+                    dataset, Dataset
+                ):  # `IterableDataset.map` does not support `desc`
+                    map_kwargs["desc"] = f"Adding EOS to {dataset_name} dataset"
+
+                def add_eos(example, eos_token):
+                    if "text" in example and not example["text"].endswith(
+                        eos_token
+                    ):  # language modeling case
+                        example["text"] = example["text"] + eos_token
+                    elif "completion" in example and not example["completion"].endswith(
+                        eos_token
+                    ):
+                        example["completion"] = example["completion"] + eos_token
+                    return example
+
+                dataset = dataset.map(
+                    add_eos,
+                    fn_kwargs={"eos_token": processing_class.eos_token},
+                    remove_columns="messages"
+                    if "messages" in column_names
+                    else None,  # renamed to "text"
+                    **map_kwargs,
+                )
+
+            # Tokenize the dataset while preserving original text
+            if isinstance(
+                dataset, Dataset
+            ):  # `IterableDataset.map` does not support `desc`
+                map_kwargs["desc"] = (
+                    f"Tokenizing {dataset_name} dataset (preserving original text)"
+                )
+
+            def tokenize_with_original_text(
+                example, processing_class, dataset_text_field, assistant_only_loss
+            ):
+                """Modified tokenization function that preserves original text."""
+                result = {}
+
+                if "prompt" in example:  # prompt-completion case
+                    # Store original text
+                    result["original_prompt_text"] = example["prompt"]
+                    result["original_completion_text"] = example["completion"]
+
+                    if is_conversational(example):
+                        prompt_ids = processing_class.apply_chat_template(
+                            example["prompt"], **example.get("chat_template_kwargs", {})
+                        )
+                        prompt_completion_ids = processing_class.apply_chat_template(
+                            example["prompt"] + example["completion"],
+                            **example.get("chat_template_kwargs", {}),
+                        )
+                    else:
+                        prompt_ids = processing_class(text=example["prompt"]).input_ids
+                        prompt_completion_ids = processing_class(
+                            text=example["prompt"] + example["completion"]
+                        ).input_ids
+
+                    # Check if the tokenized prompt starts with the tokenized prompt+completion
+                    if not prompt_completion_ids[: len(prompt_ids)] == prompt_ids:
+                        warnings.warn(
+                            "Mismatch between tokenized prompt and the start of tokenized prompt+completion. "
+                            "This may be due to unexpected tokenizer behavior, whitespace issues, or special "
+                            "token handling. Verify that the tokenizer is processing text consistently."
+                        )
+
+                    # Create a completion mask
+                    completion_mask = [0] * len(prompt_ids) + [1] * (
+                        len(prompt_completion_ids) - len(prompt_ids)
+                    )
+                    result.update(
+                        {
+                            "input_ids": prompt_completion_ids,
+                            "completion_mask": completion_mask,
+                            "attention_mask": [1]
+                            * len(prompt_completion_ids),  # Add attention mask
+                        }
+                    )
+
+                elif is_conversational(example):
+                    # For conversational data (ChatML), extract prompt and completion properly
+                    messages = example["messages"]
+
+                    # Extract user and assistant messages separately
+                    user_messages = [
+                        msg for msg in messages if msg["role"] != "assistant"
+                    ]
+                    assistant_messages = [
+                        msg for msg in messages if msg["role"] == "assistant"
+                    ]
+
+                    if user_messages and assistant_messages:
+                        # Apply chat template to get the prompt (everything up to assistant)
+                        prompt_text = processing_class.apply_chat_template(
+                            user_messages,
+                            tokenize=False,
+                            add_generation_prompt=True,  # Add assistant prompt
+                            **example.get("chat_template_kwargs", {}),
+                        )
+
+                        # Get the full conversation with assistant response
+                        full_text = processing_class.apply_chat_template(
+                            messages,
+                            tokenize=False,
+                            add_generation_prompt=False,
+                            **example.get("chat_template_kwargs", {}),
+                        )
+
+                        # Extract completion as everything after the prompt
+                        # This ensures we capture any extra tokens (like <think> tags) that the template adds
+                        if full_text.startswith(prompt_text):
+                            completion_text = full_text[len(prompt_text) :]
+                        else:
+                            # Fallback: use assistant content + EOS
+                            assistant_content = assistant_messages[0]["content"]
+                            completion_text = (
+                                assistant_content + processing_class.eos_token
+                                if hasattr(processing_class, "eos_token")
+                                else assistant_content
+                            )
+
+                        # Store original text for cross-tokenizer distillation
+                        result["original_prompt_text"] = prompt_text
+                        result["original_completion_text"] = completion_text
+                    else:
+                        # Fallback: use empty prompt and full text as completion
+                        full_text = processing_class.apply_chat_template(
+                            messages,
+                            tokenize=False,
+                            **example.get("chat_template_kwargs", {}),
+                        )
+                        result["original_prompt_text"] = ""
+                        result["original_completion_text"] = full_text
+
+                    # Process the conversation normally
+                    processed = processing_class.apply_chat_template(
+                        example["messages"],
+                        return_dict=True,
+                        return_assistant_tokens_mask=assistant_only_loss,
+                        **example.get("chat_template_kwargs", {}),
+                    )
+                    if (
+                        "assistant_masks" in processed
+                        and 1 not in processed["assistant_masks"]
+                    ):
+                        raise RuntimeError(
+                            "You're using `assistant_only_loss=True`, but at least one example has no "
+                            "assistant tokens. This usually means the tokenizer's chat template doesn't "
+                            "generate assistant masks — it may be missing the `{% generation %}` tag. Please "
+                            "check the template and ensure it's correctly configured to support assistant "
+                            "masking."
+                        )
+                    result.update(
+                        {
+                            k: processed[k]
+                            for k in ("input_ids", "assistant_masks")
+                            if k in processed
+                        }
+                    )
+                    # Add attention_mask if not already present
+                    if "attention_mask" not in result:
+                        result["attention_mask"] = [1] * len(result["input_ids"])
+                else:
+                    # For regular language modeling, store the full text as completion and empty prompt
+                    result["original_prompt_text"] = ""
+                    result["original_completion_text"] = example.get(
+                        dataset_text_field, example.get("text", "")
+                    )
+
+                    tokenized = processing_class(text=example[dataset_text_field])
+                    result.update(
+                        {
+                            "input_ids": tokenized.input_ids,
+                            "attention_mask": getattr(
+                                tokenized,
+                                "attention_mask",
+                                [1] * len(tokenized.input_ids),
+                            ),
+                        }
+                    )
+
+                return result
+
+            dataset = dataset.map(
+                tokenize_with_original_text,
+                fn_kwargs={
+                    "processing_class": processing_class,
+                    "dataset_text_field": args.dataset_text_field,
+                    "assistant_only_loss": args.assistant_only_loss,
+                },
+                **map_kwargs,
+            )
+
+            # Pack or truncate
+            if packing:
+                if args.max_length is None:
+                    raise ValueError(
+                        "When packing is enabled, `max_length` can't be `None`."
+                    )
+                if isinstance(
+                    dataset, Dataset
+                ):  # `IterableDataset.map` does not support `desc`
+                    map_kwargs["desc"] = f"Packing {dataset_name} dataset"
+
+                columns_to_keep = [
+                    "input_ids",
+                    "original_prompt_text",
+                    "original_completion_text",
+                ]
+                existing_columns = set(dataset.column_names)
+                columns_to_select = [
+                    col for col in columns_to_keep if col in existing_columns
+                ]
+
+                dataset = dataset.select_columns(columns_to_select)
+                dataset = pack_dataset(
+                    dataset, args.max_length, args.packing_strategy, map_kwargs
+                )
+            elif args.max_length is not None:
+                if isinstance(
+                    dataset, Dataset
+                ):  # `IterableDataset.map` does not support `desc`
+                    map_kwargs["desc"] = f"Truncating {dataset_name} dataset"
+                dataset = truncate_dataset(dataset, args.max_length, map_kwargs)
+
+            if args.use_liger_kernel:
+                required_columns = {
+                    "input_ids",
+                    "attention_mask",
+                    "position_ids",
+                    "completion_mask",
+                    "assistant_masks",
+                    "original_prompt_text",
+                    "original_completion_text",
+                }
+                dataset = dataset.select_columns(
+                    required_columns.intersection(dataset.column_names)
+                )
+
+        return dataset
+
+    @staticmethod
+    def generalized_jsd_loss(
+        student_logits,
+        teacher_logits,
+        labels=None,
+        beta=0.5,
+        temperature=1.0,
+        reduction="batchmean",
+        logits_are_probs=False,
+    ):
+        """
+        Compute the generalized Jensen-Shannon Divergence loss for knowledge distillation using F.kl_div. See Eq. (1)
+        of https://huggingface.co/papers/2306.13649 for the definition.
+
+        Args:
+            student_logits:
+                Tensor of shape (batch_size, sequence_length, vocab_size)
+            teacher_logits:
+                Tensor of shape (batch_size, sequence_length, vocab_size)
+            labels:
+                Tensor of shape (batch_size, sequence_length) with -100 for padding tokens to ignore when computing
+                loss
+            beta:
+                Interpolation coefficient between 0 and 1 (default: 0.5)
+            temperature:
+                Softmax temperature (default: 1.0)
+            reduction:
+                Specifies the reduction to apply to the output (default: 'batchmean')
+
+        Returns
+        -------
+            loss: Scalar tensor with the generalized JSD loss
+        """
+        if logits_are_probs:
+            student_log_probs = torch.log(student_logits.clamp_min(1e-8))
+            teacher_log_probs = torch.log(teacher_logits.clamp_min(1e-8))
+        else:
+            # Apply temperature scaling to logits before computing probabilities
+            student_logits = student_logits / temperature
+            teacher_logits = teacher_logits / temperature
+            # Compute log probabilities for student and probabilities for teacher
+            student_log_probs = F.log_softmax(student_logits, dim=-1)
+            teacher_log_probs = F.log_softmax(teacher_logits, dim=-1)
+
+        if beta == 0:
+            jsd = F.kl_div(
+                student_log_probs, teacher_log_probs, reduction="none", log_target=True
+            )
+        elif beta == 1:
+            jsd = F.kl_div(
+                teacher_log_probs, student_log_probs, reduction="none", log_target=True
+            )
+        else:
+            # Compute the log of the mixture distribution
+            # log(a + b) = log(exp(log(a)) + exp(log(b))) -> for mixture
+            beta = torch.tensor(
+                beta, dtype=student_log_probs.dtype, device=student_log_probs.device
+            )
+            mixture_log_probs = torch.logsumexp(
+                torch.stack(
+                    [
+                        student_log_probs + torch.log1p(-beta),
+                        teacher_log_probs + torch.log(beta),
+                    ]
+                ),
+                dim=0,
+            )
+
+            # Compute KL divergences using F.kl_div
+            # PyTorch differs from the standard mathematical definition, so the order of the probability distributions is swapped compared to that defined in the paper.
+            kl_teacher = F.kl_div(
+                mixture_log_probs, teacher_log_probs, reduction="none", log_target=True
+            )
+            kl_student = F.kl_div(
+                mixture_log_probs, student_log_probs, reduction="none", log_target=True
+            )
+
+            # Compute the Generalized Jensen-Shannon Divergence
+            jsd = beta * kl_teacher + (1 - beta) * kl_student
+
+        # Masking
+        if labels is not None:
+            mask = labels != -100
+            jsd = jsd[mask]
+
+        # Apply reduction
+        if reduction == "batchmean":
+            return (
+                jsd.sum() / mask.sum()
+                if labels is not None
+                else jsd.sum() / jsd.size(0)
+            )
+        if reduction == "sum":
+            return jsd.sum()
+        if reduction == "mean":
+            return jsd.mean()
+        return jsd
+
+    def compute_loss(
+        self, model, inputs, return_outputs=False, num_items_in_batch=None
+    ):
+        if self.use_uld_loss and self.teacher_tokenizer is not None:
+            if (
+                "original_prompt_text" in inputs
+                and "original_completion_text" in inputs
+            ):
+                prompt_texts = inputs["original_prompt_text"]
+                completion_texts = inputs["original_completion_text"]
+                full_texts = [
+                    p + c for p, c in zip(prompt_texts, completion_texts, strict=True)
+                ]
+            else:
+                # Fallback: decode student input_ids (current approach)
+                # WARNING: This may not work perfectly for cross-tokenizer distillation
+                full_sequences = inputs["input_ids"]
+                full_texts = self.processing_class.batch_decode(
+                    full_sequences, skip_special_tokens=False
+                )
+
+                # Try to split prompt/completion using original prompt length
+                prompt_lengths = inputs["prompts"].shape[1]
+                prompt_texts = self.processing_class.batch_decode(
+                    inputs["prompts"], skip_special_tokens=False
+                )
+                completion_texts = [
+                    full.replace(prompt, "", 1)
+                    for full, prompt in zip(full_texts, prompt_texts, strict=True)
+                ]
+
+            (
+                teacher_input_ids,
+                teacher_labels,
+                teacher_attention_mask,
+                teacher_prompt_length,
+            ) = build_teacher_inputs_from_texts(
+                self.teacher_tokenizer,
+                prompt_texts,
+                completion_texts,
+            )
+
+            teacher_input_ids = teacher_input_ids.to(self.accelerator.device)
+            teacher_labels = teacher_labels.to(self.accelerator.device)
+            teacher_attention_mask = teacher_attention_mask.to(self.accelerator.device)
+
+            outputs_student = model(
+                input_ids=inputs["input_ids"],
+                attention_mask=inputs["attention_mask"],
+                use_cache=False,
+            )
+
+            self.teacher_model.eval()
+            with torch.no_grad():
+                outputs_teacher = self.teacher_model(
+                    input_ids=teacher_input_ids,
+                    attention_mask=teacher_attention_mask,
+                )
+
+            # These are not used for ULD loss but are needed if JSD loss were to be used in this branch
+            student_prompt_length = inputs["prompts"].shape[1]
+            shifted_student_logits = outputs_student.logits[
+                :, student_prompt_length - 1 : -1, :
+            ]
+            shifted_teacher_logits = outputs_teacher.logits[
+                :, teacher_prompt_length - 1 : -1, :
+            ]
+            shifted_labels = inputs["labels"][:, student_prompt_length:]
+        elif self.use_liger_gkd_loss:
+            # Forward only through the base models (avoid lm_head to save memory)
+            unwrapped_student = self.accelerator.unwrap_model(model)
+            if (
+                hasattr(unwrapped_student, "get_decoder")
+                and unwrapped_student.get_decoder() is not None
+            ):
+                base_student = unwrapped_student.get_decoder()
+            else:
+                base_student = getattr(
+                    unwrapped_student,
+                    getattr(unwrapped_student, "base_model_prefix", "model"),
+                    unwrapped_student,
+                )
+
+            student_outputs = base_student(
+                input_ids=inputs["input_ids"],
+                attention_mask=inputs["attention_mask"],
+                use_cache=False,
+            )
+
+            self.teacher_model.eval()
+            unwrapped_teacher = self.accelerator.unwrap_model(self.teacher_model)
+            if (
+                hasattr(unwrapped_teacher, "get_decoder")
+                and unwrapped_teacher.get_decoder() is not None
+            ):
+                base_teacher = unwrapped_teacher.get_decoder()
+            else:
+                base_teacher = getattr(
+                    unwrapped_teacher,
+                    getattr(unwrapped_teacher, "base_model_prefix", "model"),
+                    unwrapped_teacher,
+                )
+            with torch.no_grad():
+                teacher_outputs = base_teacher(
+                    input_ids=inputs["input_ids"],
+                    attention_mask=inputs["attention_mask"],
+                    use_cache=False,
+                )
+
+            # hidden states (shifted)
+            student_hidden = student_outputs.last_hidden_state[:, :-1]
+            teacher_hidden = teacher_outputs.last_hidden_state[:, :-1]
+
+            # Release full outputs to free memory
+            del student_outputs, teacher_outputs
+
+            # labels mask and labels (shifted)
+            labels_mask = inputs["labels"] != -100
+            masked_input_ids = torch.where(
+                labels_mask,
+                inputs["input_ids"],
+                torch.full_like(inputs["input_ids"], -100),
+            )
+            true_labels = masked_input_ids[:, 1:].contiguous()
+
+            # heads
+            student_head = unwrapped_student.get_output_embeddings()
+            teacher_head = unwrapped_teacher.get_output_embeddings()
+
+            # liger fused jsd loss
+            loss = self.liger_jsd_loss(
+                student_input=student_hidden,
+                student_weight=student_head.weight,
+                teacher_input=teacher_hidden,
+                teacher_weight=teacher_head.weight,
+                true_labels=true_labels,
+                student_bias=getattr(student_head, "bias", None),
+                teacher_bias=getattr(teacher_head, "bias", None),
+            )
+
+            # Release hidden states after loss computation
+            del student_hidden, teacher_hidden, true_labels
+        else:
+            # Original behavior for same tokenizer or when teacher_tokenizer is not provided
+            outputs_student = model(
+                input_ids=inputs["input_ids"],
+                attention_mask=inputs["attention_mask"],
+            )
+
+            self.teacher_model.eval()
+            with torch.no_grad():
+                outputs_teacher = self.teacher_model(
+                    input_ids=inputs["input_ids"],
+                    attention_mask=inputs["attention_mask"],
+                )
+
+            prompt_lengths = inputs["prompts"].shape[1]
+            shifted_student_logits = outputs_student.logits[
+                :, prompt_lengths - 1 : -1, :
+            ]
+            shifted_teacher_logits = outputs_teacher.logits[
+                :, prompt_lengths - 1 : -1, :
+            ]
+            shifted_labels = inputs["labels"][:, prompt_lengths:]
+            loss = self.generalized_jsd_loss(
+                student_logits=shifted_student_logits,
+                teacher_logits=shifted_teacher_logits,
+                labels=shifted_labels,
+                beta=self.beta,
+            )
+
+        if self.use_uld_loss:
+            student_input_ids = inputs["input_ids"]
+
+            # Use the *teacher* labels created above, not the student's.
+            teacher_labels_for_loss = (
+                teacher_labels if "teacher_labels" in locals() else inputs["labels"]
+            )
+            teacher_input_ids_for_loss = (
+                teacher_input_ids
+                if "teacher_input_ids" in locals()
+                else inputs["input_ids"]
+            )
+
+            # Create properly masked student labels (fixing batch size > 1 issue)
+            student_labels = inputs["labels"].clone()
+            if (
+                hasattr(self.processing_class, "pad_token_id")
+                and self.processing_class.pad_token_id is not None
+            ):
+                student_labels[
+                    student_labels == self.processing_class.pad_token_id
+                ] = -100
+
+            # Also mask pad tokens in teacher labels for consistency
+            if (
+                hasattr(self, "teacher_tokenizer")
+                and hasattr(self.teacher_tokenizer, "pad_token_id")
+                and self.teacher_tokenizer.pad_token_id is not None
+            ):
+                teacher_labels[
+                    teacher_labels == self.teacher_tokenizer.pad_token_id
+                ] = -100
+
+            loss = self.uld_loss_fn(
+                student_logits=outputs_student.logits,
+                teacher_logits=outputs_teacher.logits,
+                student_labels=student_labels,
+                teacher_labels=teacher_labels_for_loss,
+                student_input_ids=student_input_ids,
+                teacher_input_ids=teacher_input_ids_for_loss,
+            )
+
+            # If ULD hybrid mode produced per-step matched/unmatched components, accumulate them for logging.
+            # Use gradient_accumulation_steps to mirror Trainer's windowing behavior.
+            if hasattr(self.uld_loss_fn, "last_matched_loss") and hasattr(
+                self.uld_loss_fn, "last_unmatched_loss"
+            ):
+                try:
+                    ga = max(1, int(self.args.gradient_accumulation_steps))
+                except Exception:
+                    ga = 1
+                step_eq = 1.0 / ga
+                # read scalar values for logging
+                matched_val = (
+                    self.uld_loss_fn.last_matched_loss.item()
+                    if self.uld_loss_fn.last_matched_loss is not None
+                    else 0.0
+                )
+                unmatched_val = (
+                    self.uld_loss_fn.last_unmatched_loss.item()
+                    if self.uld_loss_fn.last_unmatched_loss is not None
+                    else 0.0
+                )
+
+                self._matched_sum += matched_val
+                self._unmatched_sum += unmatched_val
+                self._matched_step_eq += step_eq
+                self._unmatched_step_eq += step_eq
+
+        empty_cache()
+
+        return (loss, outputs_student) if return_outputs else loss
+
+    def generate_on_policy_outputs(
+        self, model, inputs, generation_config, pad_token_id=None
+    ):
+        # Generate output with respect to the prompt only
+        if self.use_transformers_paged:
+            previous_attn = self.model.config._attn_implementation
+            if is_flash_attn_2_available():
+                model.config._attn_implementation = "paged_attention"
+            else:
+                model.config._attn_implementation = "sdpa_paged"
+            prompt_mask = inputs.get("prompt_attention_mask")
+            prompts_tensor = inputs["prompts"]
+            if prompt_mask is not None:
+                prompt_sequences = [
+                    row[mask.bool()].detach().cpu().tolist()
+                    for row, mask in zip(prompts_tensor, prompt_mask, strict=True)
+                ]
+            else:
+                prompt_sequences = [
+                    row.detach().cpu().tolist() for row in prompts_tensor
+                ]
+            generated_outputs = model.generate_batch(
+                prompt_sequences, generation_config=generation_config
+            )
+            model.config._attn_implementation = previous_attn
+
+            completion_ids = [
+                output.generated_tokens for output in generated_outputs.values()
+            ]
+            generated_tokens = torch.stack(
+                [torch.tensor(ids, device=model.device) for ids in completion_ids]
+            )
+        else:
+            generated_outputs = model.generate(
+                input_ids=inputs["prompts"],
+                attention_mask=inputs.get("prompt_attention_mask", None),
+                generation_config=generation_config,
+                return_dict_in_generate=True,
+            )
+            # Get the generated token IDs
+            generated_tokens = generated_outputs.sequences
+
+        batch_size = generated_tokens.size(0)
+        device = generated_tokens.device
+
+        prompt_mask = inputs.get("prompt_attention_mask")
+        pad_token_id = (
+            pad_token_id
+            if pad_token_id is not None
+            else self.processing_class.pad_token_id
+        )
+
+        if prompt_mask is not None:
+            prompt_lengths = prompt_mask.sum(dim=1).to(torch.long)
+        elif pad_token_id is not None:
+            prompt_lengths = (
+                (inputs["prompts"] != pad_token_id).sum(dim=1).to(torch.long)
+            )
+        else:
+            prompt_lengths = torch.full(
+                (batch_size,),
+                inputs["prompts"].shape[1],
+                dtype=torch.long,
+                device=device,
+            )
+
+        new_input_ids = generated_tokens
+        new_attention_mask = torch.ones_like(new_input_ids)
+        if pad_token_id is not None:
+            new_attention_mask[new_input_ids == pad_token_id] = 0
+
+        new_labels = torch.full_like(new_input_ids, -100)
+        for idx in range(batch_size):
+            length = int(prompt_lengths[idx].item())
+            new_labels[idx, length:] = new_input_ids[idx, length:]
+
+        if pad_token_id is not None:
+            new_labels[new_input_ids == pad_token_id] = -100
+
+        prompt_texts = []
+        completion_texts = []
+        for idx in range(batch_size):
+            length = int(prompt_lengths[idx].item())
+            prompt_tokens = inputs["prompts"][idx]
+            if prompt_mask is not None:
+                prompt_tokens = prompt_tokens[prompt_mask[idx].bool()]
+            elif pad_token_id is not None:
+                prompt_tokens = prompt_tokens[prompt_tokens != pad_token_id]
+            prompt_texts.append(
+                self.processing_class.decode(
+                    prompt_tokens.tolist(),
+                    skip_special_tokens=False,
+                    clean_up_tokenization_spaces=False,
+                )
+            )
+            completion_tokens = new_input_ids[idx, length:]
+            completion_texts.append(
+                self.processing_class.decode(
+                    completion_tokens.tolist(),
+                    skip_special_tokens=False,
+                    clean_up_tokenization_spaces=False,
+                )
+            )
+
+        return (
+            new_input_ids,
+            new_attention_mask,
+            new_labels,
+            prompt_texts,
+            completion_texts,
+        )
+
+    @profiling_decorator
+    def _generate_on_policy_outputs_vllm(
+        self, inputs, generation_config, pad_token_id=None
+    ):
+        device = self.accelerator.device
+
+        # Decode prompts for vLLM (without special tokens - vLLM expects clean text)
+        prompts_text_for_vllm = self.processing_class.batch_decode(
+            inputs["prompts"],
+            skip_special_tokens=True,
+            # clean_up_tokenization_spaces=False # Keep this commented unless specific issues arise
+        )
+        # Remove padding token text if it appears, as vLLM expects clean prompts
+        if self.processing_class.pad_token:
+            prompts_text_for_vllm = [
+                p.replace(self.processing_class.pad_token, "")
+                for p in prompts_text_for_vllm
+            ]
+
+        # Also decode prompts WITH special tokens for ULD loss computation
+        prompts_text_with_special = self.processing_class.batch_decode(
+            inputs["prompts"],
+            skip_special_tokens=False,
+        )
+
+        # system_prompt = "Please reason step by step, and put your final answer within \\boxed{}."
+        # target_system_prompt = "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."
+        # prompts_text = [p.replace(target_system_prompt, system_prompt) for p in prompts_text]
+        # Add system prompt to prompts
+
+        max_completion_length = generation_config.max_completion_length
+        temperature = generation_config.temperature
+        # vLLM uses top_k=-1 for no top_k, transformers uses 0 or None.
+        top_k = (
+            generation_config.top_k
+            if generation_config.top_k and generation_config.top_k > 0
+            else -1
+        )
+        # top_p, repetition_penalty, min_p are not directly in generation_config, get from trainer args
+        top_p = self.args.top_p if hasattr(self.args, "top_p") else 1.0
+        repetition_penalty = (
+            self.args.repetition_penalty
+            if hasattr(self.args, "repetition_penalty")
+            else 1.0
+        )
+        min_p = self.args.min_p if hasattr(self.args, "min_p") else 0.0
+
+        if self.vllm_mode == "server":
+            all_prompts_text = gather_object(prompts_text_for_vllm)
+            if self.accelerator.is_main_process:
+                completion_ids = self.vllm_client.generate(
+                    prompts=all_prompts_text,
+                    n=1,  # In GKD, we generate 1 completion per prompt from student
+                    repetition_penalty=repetition_penalty,
+                    temperature=temperature,
+                    top_p=top_p,
+                    top_k=top_k,
+                    min_p=min_p,
+                    max_tokens=max_completion_length,
+                    guided_decoding_regex=self.vllm_guided_decoding_regex,
+                )
+            else:
+                completion_ids = [None] * len(all_prompts_text)
+            completion_ids = broadcast_object_list(completion_ids, from_process=0)
+            process_slice = slice(
+                self.accelerator.process_index * len(prompts_text_for_vllm),
+                (self.accelerator.process_index + 1) * len(prompts_text_for_vllm),
+            )
+            completion_ids = completion_ids[process_slice]
+        elif self.vllm_mode == "colocate":
+            if self.vllm_guided_decoding_regex:
+                guided_decoding = GuidedDecodingParams(
+                    backend="outlines", regex=self.vllm_guided_decoding_regex
+                )
+            else:
+                guided_decoding = None
+            sampling_params = SamplingParams(
+                n=1,
+                repetition_penalty=repetition_penalty,
+                temperature=temperature,
+                top_p=top_p,
+                top_k=top_k,
+                min_p=min_p,
+                max_tokens=max_completion_length,
+                guided_decoding=guided_decoding,
+            )
+
+            if hasattr(self, "vllm_tp_group") and self.vllm_tensor_parallel_size > 1:
+                # Gather prompts from all ranks in the TP group and flatten.
+                # Each rank starts with its own prompts; after gathering, all ranks see the full group set.
+                orig_size = len(prompts_text_for_vllm)
+                gathered_prompts = [None for _ in range(self.vllm_tensor_parallel_size)]
+                torch.distributed.all_gather_object(
+                    gathered_prompts, prompts_text_for_vllm, group=self.vllm_tp_group
+                )
+                all_prompts_text = [p for sublist in gathered_prompts for p in sublist]
+            else:
+                all_prompts_text = prompts_text_for_vllm
+
+            all_outputs = self.vllm_engine.generate(
+                all_prompts_text, sampling_params=sampling_params, use_tqdm=False
+            )
+            completion_ids = [
+                output.token_ids
+                for outputs in all_outputs
+                for output in outputs.outputs
+            ]
+
+            if hasattr(self, "vllm_tp_group") and self.vllm_tensor_parallel_size > 1:
+                # Slice completions for this rank within its TP group.
+                # Each rank generates all outputs — we keep only our share.
+                local_rank_in_group = torch.distributed.get_rank(
+                    group=self.vllm_tp_group
+                )
+                tp_slice = slice(
+                    local_rank_in_group * orig_size,
+                    (local_rank_in_group + 1) * orig_size,
+                )
+                completion_ids = completion_ids[tp_slice]
+
+            if self.vllm_enable_sleep_mode:
+                self.vllm_engine.sleep(level=2)
+        else:
+            raise ValueError(f"Unknown vllm_mode: {self.vllm_mode}")
+
+        # We need to combine prompt and completion for new_input_ids
+        # Tokenize prompts again to get prompt_ids on the correct device and format
+        # Use prompts_text_for_vllm (without special tokens) for tokenization since vLLM expects clean text
+        # Ensure add_special_tokens=False as vLLM typically handles prompts as raw text
+        # Calculate max_length for prompts, ensuring it's positive
+        prompt_max_length = (
+            max(1, self.args.max_length - max_completion_length)
+            if self.args.max_length
+            else None
+        )
+        prompt_tokenized = self.processing_class(
+            prompts_text_for_vllm,
+            return_tensors="pt",
+            padding="longest",
+            truncation=True if prompt_max_length else False,
+            max_length=prompt_max_length,
+            add_special_tokens=False,
+        ).to(device)
+        prompt_ids = prompt_tokenized.input_ids
+
+        completion_ids_tensors = [
+            torch.tensor(ids, device=device) for ids in completion_ids
+        ]
+        # Manually pad/truncate completions to max_completion_length length before using pad function
+        padded_completion_ids_list = []
+        for completion_tensor in completion_ids_tensors:
+            if len(completion_tensor) > max_completion_length:
+                # Truncate if longer than max_completion_length
+                padded_completion_ids_list.append(
+                    completion_tensor[:max_completion_length]
+                )
+            elif len(completion_tensor) < max_completion_length:
+                # Pad if shorter than max_completion_length
+                padding_needed = max_completion_length - len(completion_tensor)
+                padded_tensor = torch.cat(
+                    [
+                        completion_tensor,
+                        torch.full(
+                            (padding_needed,),
+                            pad_token_id,
+                            device=device,
+                            dtype=completion_tensor.dtype,
+                        ),
+                    ]
+                )
+                padded_completion_ids_list.append(padded_tensor)
+            else:
+                # Already the right length
+                padded_completion_ids_list.append(completion_tensor)
+
+        # Now all tensors are the same length, so we can stack them
+        padded_completion_ids = torch.stack(padded_completion_ids_list)
+
+        # Ensure prompt_ids and padded_completion_ids are 2D
+        if prompt_ids.ndim == 1:
+            prompt_ids = prompt_ids.unsqueeze(0)
+        if padded_completion_ids.ndim == 1:
+            padded_completion_ids = padded_completion_ids.unsqueeze(0)
+
+        new_input_ids = torch.cat([prompt_ids, padded_completion_ids], dim=1)
+
+        new_attention_mask = torch.ones_like(new_input_ids, device=device)
+        new_labels = new_input_ids.clone()
+
+        if pad_token_id is not None:
+            new_labels[new_labels == pad_token_id] = -100
+            new_attention_mask[new_input_ids == pad_token_id] = 0
+
+        # Mask prompt tokens in labels
+        prompt_lengths = prompt_ids.shape[1]
+        new_labels[:, :prompt_lengths] = -100
+
+        # IMPORTANT: Preserve original text for cross-tokenizer ULD loss
+        # Use prompts_text_with_special (with special tokens) for ULD loss computation
+        # Extract completion texts from the generated completion IDs
+        completion_texts = []
+        for comp_ids in completion_ids:
+            completion_text = self.processing_class.decode(
+                comp_ids, skip_special_tokens=False
+            )
+            completion_texts.append(completion_text)
+
+        return (
+            new_input_ids,
+            new_attention_mask,
+            new_labels,
+            prompts_text_with_special,
+            completion_texts,
+        )
+
+    def _sync_fsdp_params_to_vllm(
+        self, module: nn.Module, prefix: str = "", visited=None
+    ):
+        """Memory-efficient post-order traversal of FSDP modules to extract full parameters and sync with student vLLM."""
+        if visited is None:
+            visited = set()
+
+        for child_name, child_module in module.named_children():
+            child_prefix = f"{prefix}.{child_name}" if prefix else child_name
+            # recurse into the child
+            self._sync_fsdp_params_to_vllm(
+                child_module, prefix=child_prefix, visited=visited
+            )
+
+        if isinstance(module, FSDP):
+            with FSDP.summon_full_params(module, recurse=False, writeback=False):
+                for param_name, param in module.named_parameters():
+                    full_name = f"{prefix}.{param_name}" if prefix else param_name
+                    for extra in (
+                        "_fsdp_wrapped_module.",
+                        "_checkpoint_wrapped_module.",
+                    ):
+                        full_name = full_name.replace(extra, "")
+
+                    if full_name in visited:
+                        continue  # skip FSDP subtrees already traversed
+                    visited.add(full_name)
+
+                    if self.vllm_mode == "server" and self.accelerator.is_main_process:
+                        self.vllm_client.update_named_param(full_name, param.data)
+                    elif self.vllm_mode == "colocate":
+                        llm_model = self.vllm_engine.llm_engine.model_executor.driver_worker.model_runner.model
+                        llm_model.load_weights([(full_name, param.data)])
+
+    def _move_model_to_vllm(self):
+        """Synchronize student model weights to vLLM engine."""
+        # For DeepSpeed ZeRO-3 and FSDP, we need to gather all parameters before operations
+        deepspeed_plugin = self.accelerator.state.deepspeed_plugin
+        zero_stage_3 = deepspeed_plugin is not None and deepspeed_plugin.zero_stage == 3
+        if zero_stage_3:
+            import deepspeed
+
+            gather_if_zero3 = deepspeed.zero.GatheredParameters
+        else:
+            gather_if_zero3 = nullcontext
+
+        if self.vllm_mode == "colocate" and self.vllm_enable_sleep_mode:
+            empty_cache()
+            self.vllm_engine.wake_up(tags=["weights"])
+
+        if is_peft_model(self.model):
+            # With PEFT and FSDP/DeepSpeed ZeRO Stage 3, we must gather the full model at once before merging, as
+            # merging adapters in a sharded manner is not supported.
+            with gather_if_zero3(list(self.model.parameters())):
+                self.model.merge_adapter()
+
+                # Update vLLM weights while parameters are gathered
+                if (
+                    self.is_fsdp_enabled
+                ):  # note if using FSDP, gather_if_zero3 is nullcontext
+                    # Update vLLM weights while parameters are gathered
+                    # For PEFT with FSDP we need to use the memory efficient post-order traversal
+                    self._sync_fsdp_params_to_vllm(self.model)
+                else:
+                    # DeepSpeed ZeRO-3 with PEFT
+                    for name, param in self.model.named_parameters():
+                        # When using PEFT, we need to recover the original parameter name and discard some parameters
+                        name = name.removeprefix("base_model.model.").replace(
+                            ".base_layer", ""
+                        )
+                        if self.model.prefix in name:
+                            continue
+                        # When module to save, remove its prefix and discard the original module
+                        if "original_module" in name:
+                            continue
+                        name = name.replace("modules_to_save.default.", "")
+
+                        if (
+                            self.vllm_mode == "server"
+                            and self.accelerator.is_main_process
+                        ):
+                            self.vllm_client.update_named_param(name, param.data)
+                        elif self.vllm_mode == "colocate":
+                            llm_model = self.vllm_engine.llm_engine.model_executor.driver_worker.model_runner.model
+                            llm_model.load_weights([(name, param.data)])
+                # Unmerge adapters while parameters are still gathered
+                self.model.unmerge_adapter()
+                # Parameters will automatically be repartitioned when exiting the context
+        # For non-PEFT models, simply gather (if needed) and update each parameter individually.
+        elif self.is_fsdp_enabled:
+            # use memory-efficient post-order traversal for FSDP
+            self._sync_fsdp_params_to_vllm(self.model)
+        else:
+            # For DeepSpeed ZeRO-3, gather each parameter individually like GRPO trainer
+            for name, param in self.model.named_parameters():
+                with gather_if_zero3([param]):
+                    if self.vllm_mode == "server" and self.accelerator.is_main_process:
+                        self.vllm_client.update_named_param(name, param.data)
+                    elif self.vllm_mode == "colocate":
+                        llm_model = self.vllm_engine.llm_engine.model_executor.driver_worker.model_runner.model
+                        llm_model.load_weights([(name, param.data)])
+
+        # Reset cache on vLLM
+        if self.vllm_mode == "server" and self.accelerator.is_main_process:
+            self.vllm_client.reset_prefix_cache()
+        elif self.vllm_mode == "colocate":
+            self.vllm_engine.reset_prefix_cache()
+
+    def _wake_vllm_if_needed(self):
+        if self.vllm_mode == "colocate" and self.vllm_enable_sleep_mode:
+            empty_cache()
+            self.vllm_engine.wake_up(tags=["kv_cache"])
+
+    @profiling_decorator
+    def training_step(
+        self,
+        model: nn.Module,
+        inputs: dict[str, torch.Tensor | Any],
+        num_items_in_batch: int | None = None,
+    ) -> torch.Tensor:
+        """
+        Perform a training step for the General Online Logit Distillation (GOLD) model.
+
+        This method implements the on-policy learning approach described in the GOLD blog post. With probability
+        `self.lmbda`, it generates new responses using the student model, which are then used for training instead of
+        the offline original inputs.
+        """
+        on_policy = False
+        if random.random() <= self.lmbda:
+            on_policy = True
+            if self.use_vllm:
+                self._wake_vllm_if_needed()
+                result = self._generate_on_policy_outputs_vllm(
+                    inputs, self.generation_config, self.processing_class.pad_token_id
+                )
+                (
+                    new_input_ids,
+                    new_attention_mask,
+                    new_labels,
+                    prompt_texts,
+                    completion_texts,
+                ) = result
+            else:
+                with unwrap_model_for_generation(
+                    model, self.accelerator
+                ) as unwrapped_model:
+                    result = self.generate_on_policy_outputs(
+                        unwrapped_model,
+                        inputs,
+                        self.generation_config,
+                        self.processing_class.pad_token_id,
+                    )
+                    (
+                        new_input_ids,
+                        new_attention_mask,
+                        new_labels,
+                        prompt_texts,
+                        completion_texts,
+                    ) = result
+
+            inputs["input_ids"] = new_input_ids
+            inputs["attention_mask"] = new_attention_mask
+            inputs["labels"] = new_labels
+
+            # CRITICAL: Preserve original text for cross-tokenizer ULD loss
+            # This ensures both off-policy (dataset) and on-policy (generated) samples
+            # can use proper text-based alignment for different tokenizers
+            inputs["original_prompt_text"] = prompt_texts
+            inputs["original_completion_text"] = completion_texts
+
+            # Log prompt and completion texts
+            self._textual_logs["prompt"].extend(gather_object(prompt_texts))
+            self._textual_logs["completion"].extend(gather_object(completion_texts))
+
+        loss = super().training_step(model, inputs, num_items_in_batch)
+
+        loss_scalar = float(loss.detach())
+        ga = max(1, int(self.args.gradient_accumulation_steps))
+        step_equiv = 1.0 / ga
+
+        if on_policy:
+            self._on_policy_loss_total += loss_scalar
+            self._on_policy_step_equiv += step_equiv
+        else:
+            self._off_policy_loss_total += loss_scalar
+            self._off_policy_step_equiv += step_equiv
+        return loss
+
+    def log(self, logs: dict[str, float], start_time: float | None = None) -> None:
+        mode = "train" if self.model.training else "eval"
+        metrics = {
+            key: sum(val) / len(val) for key, val in self._metrics[mode].items()
+        }  # average the metrics
+
+        if mode == "train":
+            device = (
+                self.accelerator.device
+                if hasattr(self.accelerator, "device")
+                else torch.device("cpu")
+            )
+            # include matched/unmatched accumulators for distributed reduction
+            vec = torch.tensor(
+                [
+                    self._on_policy_loss_total,
+                    self._off_policy_loss_total,
+                    self._on_policy_step_equiv,
+                    self._off_policy_step_equiv,
+                    self._matched_sum,
+                    self._unmatched_sum,
+                    self._matched_step_eq,
+                    self._unmatched_step_eq,
+                ],
+                dtype=torch.float64,
+                device=device,
+            )
+
+            # Sum across processes so we mirror Trainer's distributed reduction
+            if (
+                getattr(self.accelerator, "distributed_type", DistributedType.NO)
+                != DistributedType.NO
+                and dist.is_available()
+                and dist.is_initialized()
+            ):
+                dist.all_reduce(vec, op=dist.ReduceOp.SUM)
+
+            (
+                on_sum,
+                off_sum,
+                on_eq,
+                off_eq,
+                matched_sum,
+                unmatched_sum,
+                matched_eq,
+                unmatched_eq,
+            ) = vec.tolist()
+
+            # Compute category averages over the *same window* as Trainer's logs
+            # (avoid div-by-zero if, e.g., no on-policy steps in the window)
+            if on_eq > 0:
+                logs["on_policy_loss"] = round(on_sum / on_eq, 4)
+            if off_eq > 0:
+                logs["off_policy_loss"] = round(off_sum / off_eq, 4)
+
+            # matched/unmatched averaged over same logging window (if present)
+            if matched_eq > 0:
+                logs["matched_loss"] = round(matched_sum / matched_eq, 4)
+            if unmatched_eq > 0:
+                logs["unmatched_loss"] = round(unmatched_sum / unmatched_eq, 4)
+
+            # Reset window accumulators after logging (just like Trainer resets its window)
+            self._on_policy_loss_total = self._off_policy_loss_total = 0.0
+            self._on_policy_step_equiv = self._off_policy_step_equiv = 0.0
+            self._matched_sum = self._unmatched_sum = 0.0
+            self._matched_step_eq = self._unmatched_step_eq = 0.0
+
+        # This method can be called both in training and evaluation. When called in evaluation, the keys in `logs`
+        # start with "eval_". We need to add the prefix "eval_" to the keys in `metrics` to match the format.
+        if mode == "eval":
+            metrics = {f"eval_{key}": val for key, val in metrics.items()}
+
+        logs = {**logs, **metrics}
+        super().log(logs, start_time)
+        self._metrics[mode].clear()
+
+        if (
+            self.accelerator.is_main_process
+            and self.log_completions
+            and ((self.state.global_step % self.log_completion_steps) == 0)
+        ):
+            if is_rich_available():
+                print_prompt_completions_sample_uld(
+                    self._textual_logs["prompt"],
+                    self._textual_logs["completion"],
+                    self.state.global_step,
+                    self.num_completions_to_print,
+                )
+
+            if (
+                self.args.report_to
+                and "wandb" in self.args.report_to
+                and wandb.run is not None
+            ):
+                import pandas as pd
+
+                table = {
+                    "step": [str(self.state.global_step)]
+                    * len(self._textual_logs["prompt"]),
+                    "prompt": self._textual_logs["prompt"],
+                    "completion": self._textual_logs["completion"],
+                }
+                df = pd.DataFrame(table)
+                if self.wandb_log_unique_prompts:
+                    df = df.drop_duplicates(subset=["prompt"])
+                if self.num_completions_to_print and len(df) > 0:
+                    df = df.sample(n=self.num_completions_to_print, random_state=42)
+                wandb.log({"completions": wandb.Table(dataframe=df)})
diff --git a/src/aixpert/training/training/trl/experimental/grpo_with_replay_buffer/__init__.py b/src/aixpert/training/training/trl/experimental/grpo_with_replay_buffer/__init__.py
new file mode 100644
index 0000000..6cf7ae5
--- /dev/null
+++ b/src/aixpert/training/training/trl/experimental/grpo_with_replay_buffer/__init__.py
@@ -0,0 +1,16 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .grpo_with_replay_buffer_config import GRPOWithReplayBufferConfig
+from .grpo_with_replay_buffer_trainer import GRPOWithReplayBufferTrainer, ReplayBuffer
diff --git a/src/aixpert/training/training/trl/experimental/grpo_with_replay_buffer/grpo_with_replay_buffer_config.py b/src/aixpert/training/training/trl/experimental/grpo_with_replay_buffer/grpo_with_replay_buffer_config.py
new file mode 100644
index 0000000..6f0b038
--- /dev/null
+++ b/src/aixpert/training/training/trl/experimental/grpo_with_replay_buffer/grpo_with_replay_buffer_config.py
@@ -0,0 +1,34 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass, field
+
+from ...trainer.grpo_config import GRPOConfig
+
+
+@dataclass
+class GRPOWithReplayBufferConfig(GRPOConfig):
+    """
+    New Parameters:
+        replay_buffer_size (`int`, *optional*, defaults to `0`):
+                A cache that stores the rollouts with the highest advantage scores and variance per group. If a new
+                group has 0 variance, it is replaced with a group sampled from the replay buffer.
+    """
+
+    replay_buffer_size: int = field(
+        default=64,
+        metadata={
+            "help": "A cache that stores the rollouts with the highest advantage scores and variance per group. If a new group has 0 variance, it is replaced with a group sampled from the replay buffer."
+        },
+    )
diff --git a/src/aixpert/training/training/trl/experimental/grpo_with_replay_buffer/grpo_with_replay_buffer_trainer.py b/src/aixpert/training/training/trl/experimental/grpo_with_replay_buffer/grpo_with_replay_buffer_trainer.py
new file mode 100644
index 0000000..cbc7d07
--- /dev/null
+++ b/src/aixpert/training/training/trl/experimental/grpo_with_replay_buffer/grpo_with_replay_buffer_trainer.py
@@ -0,0 +1,861 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import heapq
+from typing import Any
+
+import torch
+from accelerate.utils import gather_object
+
+from ...data_utils import (
+    apply_chat_template,
+    is_conversational,
+    prepare_multimodal_messages,
+)
+from ...trainer.grpo_trainer import GRPOTrainer
+from ...trainer.utils import nanmax, nanmin, nanstd, pad
+from .grpo_with_replay_buffer_config import GRPOWithReplayBufferConfig
+
+
+class ReplayBuffer:
+    """
+    A simple replay buffer to store and sample previously seen rollouts.
+    """
+
+    def __init__(self, max_size: int):
+        self.max_size = max_size
+        self.heap = []  # Min-heap of (score, data) tuples
+
+    def add(self, scores: list[float], data: list[dict]):
+        for score, datum in zip(scores, data, strict=True):
+            if len(self.heap) < self.max_size:
+                heapq.heappush(self.heap, (score, datum))
+            # Only add if score is better than worst (minimum) item
+            elif score > self.heap[0][0]:
+                heapq.heapreplace(self.heap, (score, datum))
+
+    def sample(self, num_samples: int) -> list[dict[str, torch.Tensor]]:
+        if not self.heap:
+            return None
+
+        # Sample by normalized scores
+        scores = torch.tensor([item[0] for item in self.heap], dtype=torch.float32)
+        probabilities = scores / scores.sum()
+        replacement = False
+        if num_samples > len(self.heap):
+            replacement = True
+        chosen_indices = torch.multinomial(
+            probabilities, num_samples, replacement=replacement
+        ).tolist()
+        return [self.heap[i][1] for i in chosen_indices]
+
+
+class GRPOWithReplayBufferTrainer(GRPOTrainer):
+    def __init__(self, args: GRPOWithReplayBufferConfig | None = None, **kwargs):
+        super().__init__(args=args, **kwargs)
+        self.replay_buffer = (
+            ReplayBuffer(args.replay_buffer_size)
+            if args.replay_buffer_size > 0
+            else None
+        )
+
+    def _generate_and_score_completions(
+        self, inputs: list[dict[str, torch.Tensor | Any]]
+    ) -> dict[str, torch.Tensor | Any]:
+        device = self.accelerator.device
+        mode = "train" if self.model.training else "eval"
+
+        prompts = [x["prompt"] for x in inputs]
+
+        if "images" in inputs[0]:
+            images = [example.get("images") for example in inputs]
+        elif "image" in inputs[0]:
+            images = [
+                [example.get("image")] if example.get("image") is not None else None
+                for example in inputs
+            ]
+        else:
+            images = None
+        # Transformers requires at least one image in the batch, otherwise it throws an error
+        if images is not None and all(img_list == [] for img_list in images):
+            images = None
+
+        # If the prompts are conversational and the inputs contain images, we need to convert the prompts from
+        # [{"role": "user", "content": "What color is the sky?"}] to
+        # [{"role": "user", "content": [{"type": "image", "image": <Image>}, {"type": "text", "text": "What color is the sky?"}]}]
+        if images is not None:
+            prompts = [
+                prepare_multimodal_messages(prompt, image_list)
+                for prompt, image_list in zip(prompts, images, strict=True)
+            ]
+
+        (
+            prompt_ids_list,
+            completion_ids_list,
+            num_items_in_batch,
+            sampling_per_token_logps_list,
+            extra_fields,
+        ) = self._generate(prompts)
+
+        # Convert lists of token IDs to padded tensors
+        prompt_ids = [torch.tensor(ids, device=device) for ids in prompt_ids_list]
+        prompt_mask = [torch.ones_like(ids, dtype=torch.long) for ids in prompt_ids]
+        prompt_ids = pad(
+            prompt_ids, padding_value=self.pad_token_id, padding_side="left"
+        )
+        prompt_mask = pad(prompt_mask, padding_value=0, padding_side="left")
+        completion_ids = [
+            torch.tensor(ids, device=device) for ids in completion_ids_list
+        ]
+        completion_mask = [
+            torch.ones_like(ids, dtype=torch.long) for ids in completion_ids
+        ]
+        completion_ids = pad(
+            completion_ids, padding_value=self.pad_token_id, padding_side="right"
+        )
+        completion_mask = pad(completion_mask, padding_value=0, padding_side="right")
+        if sampling_per_token_logps_list is not None:
+            sampling_per_token_logps = [
+                torch.tensor(logps, device=device)
+                for logps in sampling_per_token_logps_list
+            ]
+            sampling_per_token_logps = pad(
+                sampling_per_token_logps, padding_value=0.0, padding_side="right"
+            )
+        else:
+            sampling_per_token_logps = None
+
+        # If mask_truncated_completions is enabled, zero out truncated completions in completion_mask
+        if self.mask_truncated_completions:
+            eos_and_pad = [self.eos_token_id, self.pad_token_id]
+            is_truncated = torch.tensor(
+                [ids[-1] not in eos_and_pad for ids in completion_ids_list],
+                device=device,
+            )
+            completion_mask = completion_mask * (~is_truncated).unsqueeze(1).int()
+
+        # Concatenate prompt_mask with completion_mask for logit computation
+        prompt_completion_ids = torch.cat(
+            [prompt_ids, completion_ids], dim=1
+        )  # (B, P+C)
+        attention_mask = torch.cat([prompt_mask, completion_mask], dim=1)  # (B, P+C)
+
+        logits_to_keep = completion_ids.size(
+            1
+        )  # we only need to compute the logits for the completion tokens
+        batch_size = (
+            self.args.per_device_train_batch_size
+            if mode == "train"
+            else self.args.per_device_eval_batch_size
+        )
+
+        num_images = (
+            [len(img_list) for img_list in images] if images is not None else None
+        )
+
+        # Get forward_kwargs for models with multimodal inputs
+        if images is not None:
+            prompts_text = [
+                apply_chat_template(
+                    {"prompt": prompt},
+                    self.processing_class,
+                    **self.chat_template_kwargs,
+                )["prompt"]
+                for prompt in prompts
+            ]
+            prompt_inputs = self.processing_class(
+                images=images, text=prompts_text, padding=True, return_tensors="pt"
+            )
+            prompt_inputs = super()._prepare_inputs(prompt_inputs)
+            forward_kwargs = {
+                k: v
+                for k, v in prompt_inputs.items()
+                if k not in ["input_ids", "attention_mask"]
+            }
+        else:
+            forward_kwargs = {}
+
+        # If token_type_ids are used, extend them with zeros for the completion part
+        if "token_type_ids" in forward_kwargs:
+            token_type_ids = forward_kwargs["token_type_ids"]
+            forward_kwargs["token_type_ids"] = torch.cat(
+                [token_type_ids, token_type_ids.new_zeros(completion_ids.shape)], dim=1
+            )
+
+        with torch.no_grad():
+            # If the generation and optimization steps are misaligned—i.e., if generation does not occur at the end of
+            # a full optimizer step (when gradient_accumulation_steps is not a multiple of generate_every)—then the
+            # samples may come from an earlier version of the model. In that case, we need to track old_per_token_logps
+            # for importance sampling. If the steps are aligned, importance sampling isn't necessary and we set
+            # old_per_token_logps to None.
+            # When using vLLM, we always compute old_per_token_logps for importance sampling, it was shown that the
+            # distribution mismatch between vLLM and the training model can be large and harm the training.
+            generate_every = (
+                self.args.steps_per_generation * self.num_iterations
+            )  # generation frequency
+            if self.args.gradient_accumulation_steps % generate_every != 0 or (
+                self.use_vllm and self.vllm_importance_sampling_correction
+            ):
+                old_per_token_logps, _ = self._get_per_token_logps_and_entropies(
+                    self.model,
+                    prompt_completion_ids,
+                    attention_mask,
+                    logits_to_keep,
+                    batch_size,
+                    num_images=num_images,
+                    **forward_kwargs,  # may contain pixel_values, image_grid_thw, pixel_attention_mask and image_sizes
+                )
+            else:
+                old_per_token_logps = None
+
+            # Compute the importance sampling ratio when using vLLM, to correct for potential distribution mismatch
+            if self.use_vllm and self.vllm_importance_sampling_correction:
+                importance_sampling_ratio = torch.exp(
+                    old_per_token_logps - sampling_per_token_logps
+                )
+                importance_sampling_ratio = torch.clamp(
+                    importance_sampling_ratio, max=self.vllm_importance_sampling_cap
+                )
+
+            # Compute the per-token log probabilities for the reference model
+            if self.beta != 0.0:
+                if self.ref_model is not None:
+                    ref_per_token_logps, _ = self._get_per_token_logps_and_entropies(
+                        self.ref_model,
+                        prompt_completion_ids,
+                        attention_mask,
+                        logits_to_keep,
+                        batch_size=batch_size,
+                        num_images=num_images,
+                        **forward_kwargs,  # may contain pixel_values, image_grid_thw, pixel_attention_mask and image_sizes
+                    )
+                else:
+                    with self.accelerator.unwrap_model(self.model).disable_adapter():
+                        ref_per_token_logps, _ = (
+                            self._get_per_token_logps_and_entropies(
+                                self.model,
+                                prompt_completion_ids,
+                                attention_mask,
+                                logits_to_keep,
+                                batch_size=batch_size,
+                                num_images=num_images,
+                                **forward_kwargs,  # may contain pixel_values, image_grid_thw, pixel_attention_mask and image_sizes
+                            )
+                        )
+            else:
+                ref_per_token_logps = None
+
+        # Decode
+        prompts_text = self.processing_class.batch_decode(
+            prompt_ids, skip_special_tokens=True
+        )
+        completions_text = self.processing_class.batch_decode(
+            completion_ids, skip_special_tokens=True
+        )
+        if is_conversational(inputs[0]):
+            completions = []
+            for prompt, completion in zip(prompts, completions_text, strict=True):
+                bootstrap = (
+                    prompt.pop()["content"] if prompt[-1]["role"] == "assistant" else ""
+                )
+                if isinstance(
+                    bootstrap, list
+                ):  # for VLM, the format might be [{"type": "text", "text": "..."}]
+                    assert len(bootstrap) == 1 and bootstrap[0]["type"] == "text"
+                    bootstrap = bootstrap[0]["text"]
+                completions.append(
+                    [{"role": "assistant", "content": bootstrap + completion}]
+                )
+        else:
+            completions = completions_text
+
+        # Merge extra_fields from rollout_func into inputs for reward functions
+        if extra_fields:
+            for i, inp in enumerate(inputs):
+                for key, values in extra_fields.items():
+                    if isinstance(values, list) and i < len(values):
+                        inp[key] = values[i]
+                    elif not isinstance(values, list):
+                        inp[key] = values
+
+        # Calculate rewards for each reward function. rewards_per_func aggregates rewards across all processes. This is
+        # important because rewards will be normalized per group, and completions are distributed. We will later slice
+        # rewards_per_func to extract each process's subset.
+        rewards_per_func = self._calculate_rewards(
+            inputs, prompts, completions, completion_ids_list
+        )
+
+        # Apply weights to each reward function's output and sum
+        rewards = (
+            rewards_per_func * self.reward_weights.to(device).unsqueeze(0)
+        ).nansum(dim=1)
+
+        # Compute grouped-wise rewards
+        mean_grouped_rewards = rewards.view(-1, self.num_generations).mean(dim=1)
+
+        # Normalize the rewards to compute the advantages
+        mean_grouped_rewards = mean_grouped_rewards.repeat_interleave(
+            self.num_generations, dim=0
+        )
+        advantages = rewards - mean_grouped_rewards
+
+        grouped_std_rewards = rewards.view(-1, self.num_generations).std(dim=1)
+        grouped_std_rewards = grouped_std_rewards.repeat_interleave(
+            self.num_generations, dim=0
+        )
+
+        if self.scale_rewards in ["group", "none"]:
+            # If self.scale_rewards = "none", we'll still log group level std
+            std_rewards = grouped_std_rewards.clone()
+        elif self.scale_rewards == "batch":
+            # Compute global std
+            std_rewards = rewards.std().expand_as(rewards)
+        else:
+            raise ValueError(
+                f"Invalid value for scale_rewards: {self.scale_rewards}. Must be one of 'batch', 'group', or 'none'."
+            )
+
+        is_std_zero = torch.isclose(std_rewards, torch.zeros_like(std_rewards))
+        if self.scale_rewards != "none":
+            advantages = advantages / (std_rewards + 1e-4)
+
+        # Slice to keep only the local part of the data
+        process_slice = slice(
+            self.accelerator.process_index * len(prompts),
+            (self.accelerator.process_index + 1) * len(prompts),
+        )
+        all_process_advantages = (
+            advantages.clone()
+        )  # keep the aggregated advantages for logging
+        advantages = advantages[process_slice]
+        grouped_std_rewards = grouped_std_rewards[process_slice]
+
+        # Calculate mean reward per function, but only for samples where the function was applied (non-NaN values)
+        for i, reward_func_name in enumerate(self.reward_func_names):
+            mean_rewards = torch.nanmean(rewards_per_func[:, i]).item()
+            self._metrics[mode][f"rewards/{reward_func_name}/mean"].append(mean_rewards)
+            std_func_rewards = nanstd(rewards_per_func[:, i]).item()
+            self._metrics[mode][f"rewards/{reward_func_name}/std"].append(
+                std_func_rewards
+            )
+        self._metrics[mode]["reward"].append(mean_grouped_rewards.mean().item())
+        self._metrics[mode]["reward_std"].append(std_rewards.mean().item())
+        self._metrics[mode]["frac_reward_zero_std"].append(
+            is_std_zero.float().mean().item()
+        )
+
+        # Log prompt and completion texts
+        self._logs["prompt"].extend(gather_object(prompts_text))
+        self._logs["completion"].extend(gather_object(completions_text))
+        for i, name in enumerate(self.reward_func_names):
+            self._logs["rewards"][name].extend(rewards_per_func[:, i].tolist())
+        self._logs["advantages"].extend(all_process_advantages.tolist())
+
+        if images is not None:
+            self._logs["images"].extend(gather_object(images))
+
+        if self.use_vllm and self.vllm_importance_sampling_correction:
+            delta = torch.abs(old_per_token_logps - sampling_per_token_logps)
+            delta = delta[completion_mask.bool()]
+            mean_delta = (
+                torch.mean(delta)
+                if delta.numel() > 0
+                else torch.tensor(0.0, device=device)
+            )
+            max_delta = (
+                torch.max(delta)
+                if delta.numel() > 0
+                else torch.tensor(0.0, device=device)
+            )
+            self._metrics[mode]["sampling/sampling_logp_difference/mean"].append(
+                self.accelerator.gather(mean_delta).mean().item()
+            )
+            self._metrics[mode]["sampling/sampling_logp_difference/max"].append(
+                self.accelerator.gather(max_delta).max().item()
+            )
+
+            flat_is_ratio = importance_sampling_ratio[completion_mask.bool()]
+            min_importance_sampling_ratio = (
+                torch.min(flat_is_ratio)
+                if flat_is_ratio.numel() > 0
+                else torch.tensor(0.0, device=device)
+            )
+            mean_importance_sampling_ratio = (
+                torch.mean(flat_is_ratio)
+                if flat_is_ratio.numel() > 0
+                else torch.tensor(0.0, device=device)
+            )
+            max_importance_sampling_ratio = (
+                torch.max(flat_is_ratio)
+                if flat_is_ratio.numel() > 0
+                else torch.tensor(0.0, device=device)
+            )
+            self._metrics[mode]["sampling/importance_sampling_ratio/min"].append(
+                nanmin(self.accelerator.gather(min_importance_sampling_ratio)).item()
+            )
+            self._metrics[mode]["sampling/importance_sampling_ratio/mean"].append(
+                self.accelerator.gather(mean_importance_sampling_ratio).nanmean().item()
+            )
+            self._metrics[mode]["sampling/importance_sampling_ratio/max"].append(
+                nanmax(self.accelerator.gather(max_importance_sampling_ratio)).item()
+            )
+        outputs_after_sampling_buffer = self.update_with_replay_buffer(
+            advantages,
+            grouped_std_rewards,
+            prompt_ids,
+            prompt_mask,
+            completion_ids,
+            completion_mask,
+            forward_kwargs,
+            num_items_in_batch,
+            old_per_token_logps,
+            ref_per_token_logps,
+            importance_sampling_ratio
+            if self.use_vllm and self.vllm_importance_sampling_correction
+            else None,
+        )
+        if outputs_after_sampling_buffer is not None:
+            return outputs_after_sampling_buffer
+        output = {
+            "prompt_ids": prompt_ids,
+            "prompt_mask": prompt_mask,
+            "completion_ids": completion_ids,
+            "completion_mask": completion_mask,
+            "advantages": advantages,
+            "num_items_in_batch": num_items_in_batch,
+        }
+        if old_per_token_logps is not None:
+            output["old_per_token_logps"] = old_per_token_logps
+        if self.use_vllm and self.vllm_importance_sampling_correction:
+            output["importance_sampling_ratio"] = importance_sampling_ratio
+        if ref_per_token_logps is not None:
+            output["ref_per_token_logps"] = ref_per_token_logps
+        if "pixel_values" in forward_kwargs:
+            output["pixel_values"] = forward_kwargs["pixel_values"]
+        if "image_grid_thw" in forward_kwargs:
+            output["image_grid_thw"] = forward_kwargs["image_grid_thw"]
+        if "pixel_attention_mask" in forward_kwargs:
+            output["pixel_attention_mask"] = forward_kwargs["pixel_attention_mask"]
+        if "image_sizes" in forward_kwargs:
+            output["image_sizes"] = forward_kwargs["image_sizes"]
+        if "token_type_ids" in forward_kwargs:
+            output["token_type_ids"] = forward_kwargs["token_type_ids"]
+        if images is not None:
+            output["num_images"] = num_images
+        return output
+
+    def slice_group_data(
+        self, data: torch.Tensor, mask: torch.Tensor, group_idx: int
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Slices the input data and mask tensors for a specific group index. Also trims the sequence length to the
+        maximum length in the group based on the mask.
+
+        Args:
+            data: Tensor of shape (num_groups * num_generations, seq_length)
+            mask: Tensor of shape (num_groups * num_generations, seq_length)
+            group_idx: Index of the group to slice
+        Returns:
+            Tuple of (sliced_data, sliced_mask) for the specified group, with sequence length trimmed to the maximum
+            length in the group.
+        """
+        start_idx = group_idx * self.num_generations
+        end_idx = (group_idx + 1) * self.num_generations
+        group_data = data[start_idx:end_idx]
+        group_mask = mask[start_idx:end_idx]
+        group_max_len = group_mask.sum(dim=1).max().item()
+        return group_data[:, :group_max_len], group_mask[:, :group_max_len]
+
+    def update_replay_buffer(
+        self,
+        groups_with_variance: torch.Tensor,
+        group_advantages: torch.Tensor,
+        group_std_rewards: torch.Tensor,
+        prompt_ids: torch.Tensor,
+        prompt_mask: torch.Tensor,
+        completion_ids: torch.Tensor,
+        completion_mask: torch.Tensor,
+        forward_kwargs: dict,
+        optional_vision_fields: list[str] = None,
+        old_per_token_logps: torch.Tensor | None = None,
+        ref_per_token_logps: torch.Tensor | None = None,
+        importance_sampling_ratio: float | None = None,
+    ) -> None:
+        """
+        Update the replay buffer with groups that have reward variance (std > 0).
+
+        Args:
+            groups_with_variance: Boolean tensor indicating which groups have reward variance
+            group_advantages: Tensor of shape (num_groups, num_generations) containing advantage values
+            std_rewards: Tensor of shape (num_groups, num_generations) containing std of rewards per group
+            prompt_ids: Tensor containing prompt token IDs
+            prompt_mask: Tensor containing prompt attention masks
+            completion_ids: Tensor containing completion token IDs
+            completion_mask: Tensor containing completion attention masks
+            forward_kwargs: Dictionary containing additional prompt inputs (vision data, etc.)
+            optional_vision_fields: List of optional vision-related fields to include if present in forward_kwargs
+            old_per_token_logps: Optional tensor of old per-token log probabilities
+            ref_per_token_logps: Optional tensor of reference per-token log probabilities
+            importance_sampling_ratio: Optional importance sampling correction ratio
+        """
+        # Prepare buffered outputs for groups with variance
+        buffered_outputs = []
+        for _, group_idx in enumerate(
+            groups_with_variance.nonzero(as_tuple=True)[0].unique().tolist()
+        ):
+            group_prompt_ids, group_prompt_mask = self.slice_group_data(
+                prompt_ids, prompt_mask, group_idx
+            )
+            group_completion_ids, group_completion_mask = self.slice_group_data(
+                completion_ids, completion_mask, group_idx
+            )
+
+            # Store unpadded data in the buffer
+            buffered_output = {
+                "prompt_ids": group_prompt_ids,
+                "completion_ids": group_completion_ids,
+                "advantages": group_advantages[group_idx].tolist(),
+                "prompt_mask": group_prompt_mask,
+                "completion_mask": group_completion_mask,
+            }
+
+            # Add optional fields if they exist
+            optional_fields = {
+                "old_per_token_logps": old_per_token_logps
+                if old_per_token_logps is not None
+                else None,
+                "ref_per_token_logps": ref_per_token_logps
+                if ref_per_token_logps is not None
+                else None,
+            }
+
+            for field_name, field_data in optional_fields.items():
+                if field_data is not None:
+                    buffered_output[field_name] = self.slice_group_data(
+                        field_data, completion_mask, group_idx
+                    )[0]
+
+            # Add importance sampling if needed
+            if self.use_vllm and self.vllm_importance_sampling_correction:
+                buffered_output["importance_sampling_ratio"] = importance_sampling_ratio
+
+            if optional_vision_fields:
+                # Add vision-related fields if they exist
+                for field_name in optional_vision_fields:
+                    if field_name in forward_kwargs:
+                        buffered_output[field_name] = self.slice_group_data(
+                            forward_kwargs[field_name], prompt_mask, group_idx
+                        )[0]
+
+            buffered_outputs.append(buffered_output)
+
+        if groups_with_variance.any():
+            # Calculate replay buffer scores for groups with variance
+            replay_buffer_scores = (group_advantages.abs() * group_std_rewards).sum(
+                dim=-1
+            )[groups_with_variance]
+            # Add all groups to replay buffer at once (batch operation)
+            self.replay_buffer.add(replay_buffer_scores.tolist(), buffered_outputs)
+
+    def sample_from_replay_buffer(
+        self,
+        num_samples: int,
+        optional_vision_fields: list[str] = None,
+        optional_tensor_fields: list[str] = None,
+    ) -> list[dict]:
+        """
+        Sample groups from the replay buffer.
+
+        Args:
+            num_samples: Number of samples to draw from the replay buffer
+            optional_vision_fields: List of optional vision-related fields to include if present in sampled data
+            optional_tensor_fields: List of optional tensor fields to include if present in sampled data
+        Returns:
+            List of sampled data dictionaries from the replay buffer
+        """
+        sampled = self.replay_buffer.sample(num_samples=num_samples)
+
+        # Extract and concatenate sampled data
+        sampled_data = {
+            "prompt_ids": [],
+            "prompt_mask": [],
+            "completion_ids": [],
+            "completion_mask": [],
+            "advantages": [],
+        }
+
+        all_optional_fields = (optional_tensor_fields or []) + (
+            optional_vision_fields or []
+        )
+        # Initialize containers for optional fields if they exist in sampled data
+        for field in all_optional_fields:
+            if sampled and field in sampled[0]:
+                sampled_data[field] = []
+
+        # Extract data from each sampled item
+        for item in sampled:
+            # Handle core fields
+            for key in [
+                "prompt_ids",
+                "prompt_mask",
+                "completion_ids",
+                "completion_mask",
+            ]:
+                sampled_data[key].append(item[key])
+
+            # Handle advantages (list, not tensor)
+            sampled_data["advantages"].append(item["advantages"])
+
+            # Handle optional fields
+            for field in all_optional_fields:
+                if field in item:
+                    sampled_data[field].append(item[field])
+
+        return sampled_data
+
+    def update_with_replay_buffer(
+        self,
+        group_advantages: torch.Tensor,
+        group_std_rewards: torch.Tensor,
+        prompt_ids: torch.Tensor,
+        prompt_mask: torch.Tensor,
+        completion_ids: torch.Tensor,
+        completion_mask: torch.Tensor,
+        forward_kwargs: dict,
+        num_items_in_batch: int,
+        old_per_token_logps: torch.Tensor | None = None,
+        ref_per_token_logps: torch.Tensor | None = None,
+        importance_sampling_ratio: float | None = None,
+    ) -> None:
+        """
+        Update current batch data with samples from replay buffer.
+
+        Groups with reward variance (std > 0) are added to the replay buffer and then replaced with samples from the
+        buffer to improve training stability.
+
+        Args:
+            group_advantages: Tensor of shape (num_groups, num_generations) containing advantage values
+            std_rewards: Tensor of shape (num_groups, num_generations) containing std of rewards per group
+            prompt_ids: Tensor containing prompt token IDs
+            prompt_mask: Tensor containing prompt attention masks
+            completion_ids: Tensor containing completion token IDs
+            completion_mask: Tensor containing completion attention masks
+            forward_kwargs: Dictionary containing additional prompt inputs (vision data, etc.)
+            num_items_in_batch: Number of items in the current batch
+            old_per_token_logps: Optional tensor of old per-token log probabilities
+            ref_per_token_logps: Optional tensor of reference per-token log probabilities
+            importance_sampling_ratio: Optional importance sampling correction ratio
+        """
+        if self.replay_buffer.max_size <= 0:
+            return None
+
+        # Groups to consider for adding to the replay buffer
+        groups_with_variance = group_std_rewards.max(dim=0).values > 0
+        # Groups to replace from the replay buffer
+        groups_without_variance = ~groups_with_variance
+
+        # Track which optional fields are present in sampled data
+        optional_tensor_fields = ["old_per_token_logps", "ref_per_token_logps"]
+        vision_fields = [
+            "pixel_values",
+            "image_grid_thw",
+            "pixel_attention_mask",
+            "image_sizes",
+        ]
+
+        self.update_replay_buffer(
+            groups_with_variance,
+            group_advantages,
+            group_std_rewards,
+            prompt_ids,
+            prompt_mask,
+            completion_ids,
+            completion_mask,
+            forward_kwargs,
+            vision_fields,
+            old_per_token_logps,
+            ref_per_token_logps,
+            importance_sampling_ratio,
+        )
+
+        # Sample from replay buffer to replace groups with variance
+        num_groups_to_replace = groups_without_variance.sum().item()
+        if not num_groups_to_replace:
+            return None
+
+        sampled_data = self.sample_from_replay_buffer(
+            num_samples=num_groups_to_replace,
+            optional_vision_fields=vision_fields,
+            optional_tensor_fields=optional_tensor_fields,
+        )
+
+        # Pad sampled data if they are shorter than the current batch sequences
+        # Or pad the current batch if sampled are longer
+        current_batch_prompt_seq_len = prompt_ids.size(1)
+        current_batch_completion_seq_len = completion_ids.size(1)
+
+        groups_to_replace_idxs = (
+            groups_with_variance.logical_not()
+            .nonzero(as_tuple=True)[0]
+            .unique()
+            .tolist()
+        )
+
+        # Determine target (max) sequence lengths once
+        sampled_prompt_lengths = [t.size(1) for t in sampled_data["prompt_ids"]]
+        sampled_completion_lengths = [t.size(1) for t in sampled_data["completion_ids"]]
+        target_prompt_len = max([current_batch_prompt_seq_len] + sampled_prompt_lengths)
+        target_completion_len = max(
+            [current_batch_completion_seq_len] + sampled_completion_lengths
+        )
+
+        # If any sampled prompt is longer, pad the whole batch prompt tensors once (left padding)
+        if target_prompt_len > current_batch_prompt_seq_len:
+            prompt_ids = pad(
+                list(prompt_ids.unbind(0)),
+                padding_value=self.pad_token_id,
+                pad_to_multiple_of=target_prompt_len,
+                padding_side="left",
+            )
+            prompt_mask = pad(
+                list(prompt_mask.unbind(0)),
+                padding_value=0,
+                pad_to_multiple_of=target_prompt_len,
+                padding_side="left",
+            )
+        # If any sampled completion is longer, pad the whole batch completion tensors once (right padding)
+        if target_completion_len > current_batch_completion_seq_len:
+            completion_ids = pad(
+                list(completion_ids.unbind(0)),
+                padding_value=self.pad_token_id,
+                pad_to_multiple_of=target_completion_len,
+                padding_side="right",
+            )
+            completion_mask = pad(
+                list(completion_mask.unbind(0)),
+                padding_value=0,
+                pad_to_multiple_of=target_completion_len,
+                padding_side="right",
+            )
+            if old_per_token_logps is not None:
+                old_per_token_logps = pad(
+                    list(old_per_token_logps.unbind(0)),
+                    padding_value=0.0,
+                    pad_to_multiple_of=target_completion_len,
+                    padding_side="right",
+                )
+            if ref_per_token_logps is not None:
+                ref_per_token_logps = pad(
+                    list(ref_per_token_logps.unbind(0)),
+                    padding_value=0.0,
+                    pad_to_multiple_of=target_completion_len,
+                    padding_side="right",
+                )
+
+        # Replace per-group data, padding only sampled groups that are shorter than the target
+        for i, group_idx in enumerate(groups_to_replace_idxs):
+            start_idx = group_idx * self.num_generations
+            end_idx = (group_idx + 1) * self.num_generations
+            idx_range = slice(start_idx, end_idx)
+
+            # Pad sampled prompt to target length if needed
+            if sampled_data["prompt_ids"][i].size(1) < target_prompt_len:
+                sampled_data["prompt_ids"][i] = pad(
+                    sampled_data["prompt_ids"][i],
+                    padding_value=self.pad_token_id,
+                    pad_to_multiple_of=target_prompt_len,
+                    padding_side="left",
+                )
+                sampled_data["prompt_mask"][i] = pad(
+                    sampled_data["prompt_mask"][i],
+                    padding_value=0,
+                    pad_to_multiple_of=target_prompt_len,
+                    padding_side="left",
+                )
+
+            # Pad sampled completion to target length if needed
+            if sampled_data["completion_ids"][i].size(1) < target_completion_len:
+                sampled_data["completion_ids"][i] = pad(
+                    sampled_data["completion_ids"][i],
+                    padding_value=self.pad_token_id,
+                    pad_to_multiple_of=target_completion_len,
+                    padding_side="right",
+                )
+                sampled_data["completion_mask"][i] = pad(
+                    sampled_data["completion_mask"][i],
+                    padding_value=0,
+                    pad_to_multiple_of=target_completion_len,
+                    padding_side="right",
+                )
+                if "old_per_token_logps" in sampled_data:
+                    sampled_data["old_per_token_logps"][i] = pad(
+                        sampled_data["old_per_token_logps"][i],
+                        padding_value=0.0,
+                        pad_to_multiple_of=target_completion_len,
+                        padding_side="right",
+                    )
+                if "ref_per_token_logps" in sampled_data:
+                    sampled_data["ref_per_token_logps"][i] = pad(
+                        sampled_data["ref_per_token_logps"][i],
+                        padding_value=0.0,
+                        pad_to_multiple_of=target_completion_len,
+                        padding_side="right",
+                    )
+
+            # Assign (replace) group slice
+            prompt_ids[idx_range] = sampled_data["prompt_ids"][i]
+            prompt_mask[idx_range] = sampled_data["prompt_mask"][i]
+            completion_ids[idx_range] = sampled_data["completion_ids"][i]
+            completion_mask[idx_range] = sampled_data["completion_mask"][i]
+            group_advantages[group_idx] = sampled_data["advantages"][i]
+
+            if "old_per_token_logps" in sampled_data:
+                old_per_token_logps[idx_range] = sampled_data["old_per_token_logps"][i]
+            if "ref_per_token_logps" in sampled_data:
+                ref_per_token_logps[idx_range] = sampled_data["ref_per_token_logps"][i]
+
+            for field in vision_fields:
+                if field in sampled_data and field in forward_kwargs:
+                    forward_kwargs[field][idx_range] = sampled_data[field][i]
+
+        # Prepare final outputs after sampling and replacement
+        outputs_after_sampling_buffer = {
+            "prompt_ids": prompt_ids,
+            "prompt_mask": prompt_mask,
+            "completion_ids": completion_ids,
+            "completion_mask": completion_mask,
+            "advantages": group_advantages,
+        }
+
+        # Replace optional tensor fields if they exist
+        for field in optional_tensor_fields:
+            if field in sampled_data:
+                outputs_after_sampling_buffer[field] = (
+                    old_per_token_logps
+                    if field == "old_per_token_logps"
+                    else ref_per_token_logps
+                )
+
+        # Replace vision fields if they exist
+        for field in vision_fields:
+            if field in sampled_data and field in forward_kwargs:
+                outputs_after_sampling_buffer[field] = forward_kwargs[field]
+
+        outputs_after_sampling_buffer["num_items_in_batch"] = num_items_in_batch
+        if self.use_vllm and self.vllm_importance_sampling_correction:
+            outputs_after_sampling_buffer["importance_sampling_ratio"] = (
+                importance_sampling_ratio
+            )
+
+        return outputs_after_sampling_buffer
diff --git a/src/aixpert/training/training/trl/experimental/gspo_token/__init__.py b/src/aixpert/training/training/trl/experimental/gspo_token/__init__.py
new file mode 100644
index 0000000..1482c63
--- /dev/null
+++ b/src/aixpert/training/training/trl/experimental/gspo_token/__init__.py
@@ -0,0 +1,15 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .grpo_trainer import GRPOTrainer
diff --git a/src/aixpert/training/training/trl/experimental/gspo_token/grpo_trainer.py b/src/aixpert/training/training/trl/experimental/gspo_token/grpo_trainer.py
new file mode 100644
index 0000000..59bef88
--- /dev/null
+++ b/src/aixpert/training/training/trl/experimental/gspo_token/grpo_trainer.py
@@ -0,0 +1,198 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+
+from ...trainer.grpo_trainer import GRPOTrainer as _GRPOTrainer
+from ...trainer.utils import nanmax, nanmin
+
+
+class GRPOTrainer(_GRPOTrainer):
+    def _compute_loss(self, model, inputs):
+        # Compute the per-token log probabilities for the model
+        prompt_ids, prompt_mask = inputs["prompt_ids"], inputs["prompt_mask"]
+        completion_ids, completion_mask = (
+            inputs["completion_ids"],
+            inputs["completion_mask"],
+        )
+        input_ids = torch.cat([prompt_ids, completion_ids], dim=1)
+        attention_mask = torch.cat([prompt_mask, completion_mask], dim=1)
+        logits_to_keep = completion_ids.size(
+            1
+        )  # we only need to compute the logits for the completion tokens
+
+        # Compute the per_token_logps and the entropy at each position in the completion
+        per_token_logps, entropies = self._get_per_token_logps_and_entropies(
+            model,
+            input_ids,
+            attention_mask,
+            logits_to_keep,
+            compute_entropy=True,
+            pixel_values=inputs.get("pixel_values"),
+            image_grid_thw=inputs.get("image_grid_thw"),
+            num_images=inputs.get("num_images"),
+            pixel_attention_mask=inputs.get("pixel_attention_mask"),
+            image_sizes=inputs.get("image_sizes"),
+            token_type_ids=inputs.get("token_type_ids"),
+        )
+
+        if self.top_entropy_quantile < 1.0:
+            entropy_mask = self.get_high_entropy_mask(
+                entropies, completion_mask, 1 - self.top_entropy_quantile
+            )
+        else:
+            entropy_mask = None
+
+        # Compute the KL divergence between the model and the reference model
+        if self.beta != 0.0:
+            ref_per_token_logps = inputs["ref_per_token_logps"]
+            per_token_kl = (
+                torch.exp(ref_per_token_logps - per_token_logps)
+                - (ref_per_token_logps - per_token_logps)
+                - 1
+            )
+
+        # Compute the loss
+        advantages = inputs["advantages"]
+        # When num_iterations == 1 and steps_per_generation <= gradient_accumulation_steps,
+        # old_per_token_logps == per_token_logps. In this case we can skip its computation
+        # (see _generate_and_score_completions) and instead use per_token_logps.detach().
+        # The exception is when using vLLM, where we always compute old_per_token_logps
+        # for importance sampling
+        old_per_token_logps = inputs.get("old_per_token_logps")
+        old_per_token_logps = (
+            per_token_logps.detach()
+            if old_per_token_logps is None
+            else old_per_token_logps
+        )
+
+        log_ratio = per_token_logps - old_per_token_logps
+        if self.importance_sampling_level == "token":
+            log_importance_weights = log_ratio
+        elif self.importance_sampling_level == "sequence":
+            log_importance_weights = (log_ratio * completion_mask).sum(
+                -1
+            ) / completion_mask.sum(-1).clamp(min=1.0)
+            log_importance_weights = log_importance_weights.unsqueeze(-1)
+        elif self.importance_sampling_level == "sequence_token":
+            # GSPO-token: sg[si(θ)] * πθ(yi,t)/sg[πθ(yi,t)]
+            seq_level_log_weight = (log_ratio * completion_mask).sum(
+                -1
+            ) / completion_mask.sum(-1).clamp(min=1.0)
+            seq_level_log_weight = seq_level_log_weight.detach().unsqueeze(
+                -1
+            )  # Stop gradient
+            log_importance_weights = (
+                per_token_logps - per_token_logps.detach() + seq_level_log_weight
+            )
+        else:
+            raise ValueError(
+                f"Unknown importance sampling level: {self.importance_sampling_level}. Possible values are 'token' "
+                "and 'sequence'."
+            )
+        # From here, log_importance_weights (and all subsequent tensors, coef_1, coef_2, etc.) shape depends on
+        # importance_sampling_level: "token" level: (B, T); "sequence" level: (B, 1)
+
+        coef_1 = torch.exp(log_importance_weights)
+        coef_2 = torch.clamp(coef_1, 1 - self.epsilon_low, 1 + self.epsilon_high)
+
+        # Two-sided clipping
+        if self.args.delta is not None:
+            coef_1 = torch.clamp(coef_1, max=self.args.delta)
+
+        per_token_loss1 = coef_1 * advantages.unsqueeze(1)
+        per_token_loss2 = coef_2 * advantages.unsqueeze(1)
+        per_token_loss = -torch.min(per_token_loss1, per_token_loss2)
+        if entropy_mask is not None:
+            per_token_loss = per_token_loss * entropy_mask
+
+        if self.use_vllm and self.vllm_importance_sampling_correction:
+            per_token_loss = per_token_loss * inputs["importance_sampling_ratio"]
+
+        if self.beta != 0.0:
+            per_token_loss = per_token_loss + self.beta * per_token_kl
+
+        if self.loss_type == "grpo":
+            loss = (
+                (per_token_loss * completion_mask).sum(-1)
+                / completion_mask.sum(-1).clamp(min=1.0)
+            ).mean()
+            loss = loss / self.current_gradient_accumulation_steps
+        elif self.loss_type == "bnpo":
+            loss = (
+                per_token_loss * completion_mask
+            ).sum() / completion_mask.sum().clamp(min=1.0)
+            loss = loss / self.current_gradient_accumulation_steps
+        elif self.loss_type == "dr_grpo":
+            loss = (per_token_loss * completion_mask).sum() / (
+                per_token_loss.size(0) * self.max_completion_length
+            )
+            loss = loss / self.current_gradient_accumulation_steps
+        elif self.loss_type == "dapo":
+            normalizer = inputs["num_items_in_batch"] / self.accelerator.num_processes
+            loss = (per_token_loss * completion_mask).sum() / normalizer
+        else:
+            raise ValueError(f"Unknown loss type: {self.loss_type}")
+
+        # Log the metrics
+        mode = "train" if self.model.training else "eval"
+
+        completion_token_count = completion_mask.sum().clamp(min=1.0)
+
+        def masked_batch_mean(x):
+            if x.shape[1] == 1:  # when importance_sampling_level == "sequence"
+                return x.mean()
+            return (x * completion_mask).sum() / completion_token_count
+
+        if self.beta != 0.0:
+            mean_kl = masked_batch_mean(per_token_kl)
+            self._metrics[mode]["kl"].append(
+                self.accelerator.gather(mean_kl).nanmean().item()
+            )
+
+        mean_entropy = masked_batch_mean(entropies)
+        self._metrics[mode]["entropy"].append(
+            self.accelerator.gather(mean_entropy).nanmean().item()
+        )
+
+        # Compute the clipped probability ratios
+        is_low_clipped = (coef_1 < 1 - self.epsilon_low) & (advantages.unsqueeze(1) < 0)
+        is_high_clipped = (coef_1 > 1 + self.epsilon_high) & (
+            advantages.unsqueeze(1) > 0
+        )
+        is_region_clipped = is_low_clipped | is_high_clipped
+
+        low_clip = masked_batch_mean(is_low_clipped.float())
+        high_clip = masked_batch_mean(is_high_clipped.float())
+        clip_ratio = masked_batch_mean(is_region_clipped.float())
+
+        gathered_low_clip = self.accelerator.gather(low_clip)
+        self._metrics[mode]["clip_ratio/low_mean"].append(
+            gathered_low_clip.nanmean().item()
+        )
+        self._metrics[mode]["clip_ratio/low_min"].append(
+            nanmin(gathered_low_clip).item()
+        )
+        gathered_high_clip = self.accelerator.gather(high_clip)
+        self._metrics[mode]["clip_ratio/high_mean"].append(
+            gathered_high_clip.nanmean().item()
+        )
+        self._metrics[mode]["clip_ratio/high_max"].append(
+            nanmax(gathered_high_clip).item()
+        )
+        gathered_clip_ratio = self.accelerator.gather(clip_ratio)
+        self._metrics[mode]["clip_ratio/region_mean"].append(
+            gathered_clip_ratio.nanmean().item()
+        )
+        return loss
diff --git a/src/aixpert/training/training/trl/experimental/judges/__init__.py b/src/aixpert/training/training/trl/experimental/judges/__init__.py
new file mode 100644
index 0000000..332e949
--- /dev/null
+++ b/src/aixpert/training/training/trl/experimental/judges/__init__.py
@@ -0,0 +1,36 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .judges import (
+    AllTrueJudge,
+    BaseBinaryJudge,
+    BaseJudge,
+    BasePairwiseJudge,
+    BaseRankJudge,
+    HfPairwiseJudge,
+    OpenAIPairwiseJudge,
+    PairRMJudge,
+)
+
+
+__all__ = [
+    "AllTrueJudge",
+    "BaseBinaryJudge",
+    "BaseJudge",
+    "BasePairwiseJudge",
+    "BaseRankJudge",
+    "HfPairwiseJudge",
+    "OpenAIPairwiseJudge",
+    "PairRMJudge",
+]
diff --git a/src/aixpert/training/training/trl/experimental/judges/judges.py b/src/aixpert/training/training/trl/experimental/judges/judges.py
new file mode 100644
index 0000000..c8240d0
--- /dev/null
+++ b/src/aixpert/training/training/trl/experimental/judges/judges.py
@@ -0,0 +1,524 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import concurrent.futures
+import logging
+from abc import ABC, abstractmethod
+
+import numpy as np
+from accelerate import Accelerator
+from huggingface_hub import InferenceClient
+from transformers.utils import is_openai_available
+
+from ...import_utils import is_llm_blender_available
+
+
+if is_llm_blender_available():
+    import llm_blender
+
+if is_openai_available():
+    from openai import OpenAI
+
+
+DEFAULT_PAIRWISE_SYSTEM_PROMPT = '''I require a leaderboard for various large language models. I'll provide you with prompts given to these models and their corresponding outputs. Your task is to assess these responses, and select the model that produces the best output from a human perspective.
+
+## Instruction
+
+{{
+    "instruction": """{prompt}""",
+}}
+
+## Model Outputs
+
+Here are the unordered outputs from the models. Each output is associated with a specific model, identified by a unique model identifier.
+
+{{
+    {{
+        "model_identifier": "0",
+        "output": """{response0}"""
+    }},
+    {{
+        "model_identifier": "1",
+        "output": """{response1}"""
+    }}
+}}
+
+## Task
+
+Evaluate the models on the basis of the quality and relevance of their results, and select the model that generated the best result. Reply with the identifier of the best model. Our evaluation will only take into account the first character of your answer, so make sure it contains only one of the identifiers and nothing else (no quotation marks, no spaces, no new lines, ...).
+'''
+
+
+class BaseJudge(ABC):
+    """
+    Base class for judges. The subclasses of this class should implement the `judge` method.
+    """
+
+    @abstractmethod
+    def judge(
+        self, prompts: list[str], completions: list[str], shuffle_order: bool = True
+    ) -> list:
+        raise NotImplementedError("Judge subclasses must implement the `judge` method.")
+
+
+class BaseRankJudge(ABC):
+    """
+    Base class for LLM ranking judges.
+
+    **Example**:
+    ```python
+    class MyRankJudge(BaseRankJudge):
+        def judge(self, prompts, completions, shuffle_order=True):
+            return ...  # Your ranking logic here
+
+
+    judge = MyRankJudge()
+    judge.judge(
+        prompts=["The capital of France is", "The capital of Germany is"],
+        completions=[[" Paris", " Marseille", "Lyon"], [" Munich", " Berlin"]],
+    )  # [[0, 1, 2], [1, 0]]
+    ```
+    """
+
+    @abstractmethod
+    def judge(
+        self,
+        prompts: list[str],
+        completions: list[list[str]],
+        shuffle_order: bool = True,
+    ) -> list[list[int]]:
+        """
+        Judge the completion for the given prompts and return the ranks of each completion.
+
+        Args:
+            prompts (`list[str]`):
+                List of prompts.
+            completions (`list[list[str]]`):
+                List of completions list, where each element is a list of completions for the corresponding prompt.
+            shuffle_order (`bool`, *optional*, defaults to `True`):
+                Whether to shuffle the order of the completions to avoid positional bias.
+
+        Returns
+        -------
+            `list[list[int]]`:
+                List of lists of idxs, where each list contains the ranks of the completions for the corresponding
+                prompt. E.g., `[1, 2, 0]` means that the second completion (`idx=1`) is the best, followed by the
+                third, and then the first.
+        """
+        raise NotImplementedError("Judge subclasses must implement the `judge` method.")
+
+
+class BasePairwiseJudge(BaseJudge):
+    """
+    Base class for pairwise judges.
+    """
+
+    @abstractmethod
+    def judge(
+        self,
+        prompts: list[str],
+        completions: list[list[str]],
+        shuffle_order: bool = True,
+    ) -> list[int]:
+        """
+        Judge the completion pairs for the given prompts.
+
+        Args:
+            prompts (`list[str]`):
+                List of prompts.
+            completions (`list[list[str]]`):
+                List of completions pairs, where each element is a pair of completions for the corresponding prompt.
+            shuffle_order (`bool`, *optional*, defaults to `True`):
+                Whether to shuffle the order of the completions to avoid positional bias.
+
+        Returns
+        -------
+            `list[int]`:
+                List of idxs, where each idx is the rank of the best completion for the corresponding prompt. E.g., `1`
+                means that the second completion (`idx=1`) is the best.
+
+        Note:
+            If the judge returns `-1` for any prompt, it indicates that the inner process used to compute the
+            preference has failed. For instance, this could occur if the underlying language model returned an invalid
+            answer. In such cases, the caller should handle these invalid indices appropriately, possibly by
+            implementing fallback logic or error handling.
+        """
+        raise NotImplementedError("Judge subclasses must implement the `judge` method.")
+
+
+class BaseBinaryJudge(BaseJudge):
+    """
+    Base class for binary judges.
+    """
+
+    @abstractmethod
+    def judge(
+        self,
+        prompts: list[str],
+        completions: list[str],
+        gold_completions: list[str] | None = None,
+        shuffle_order: bool = True,
+    ) -> list[int]:
+        """
+        Judge the completion for a given prompt. Used to assess if a completion satisfies a constraint.
+
+        This base class should be used to implement binary evaluations as done in section 4.1.4 of the [CGPO
+        paper](https://huggingface.co/papers/2409.20370). It is relevant for assessing whether a prompt-completion pair
+        satisfies a specific constraint.
+
+        Args:
+            prompts (`list[str]`): List of prompts.
+            completions (`list[str]`): List of completions.
+            gold_completions (`list[str]`, `optional`): List of gold completions if it exists.
+            shuffle_order (`bool`): Whether to shuffle the order of the completions to avoid positional bias.
+
+        Returns
+        -------
+            list[int]: A list of binary labels:
+                - 1 indicates that the completion satisfies the evaluated constraint.
+                - 0 indicates that the completion does not satisfy the evaluated constraint.
+
+        Note:
+            If the judge returns -1 for any prompt, it indicates that the inner process used to compute the preference
+            has failed. For instance, this could occur if the underlying language model or rule based constraint
+            returned an invalid answer. In such cases, the caller should handle these invalid indices appropriately,
+            possibly by implementing fallback logic or error handling.
+        """
+        raise NotImplementedError("Judge subclasses must implement the `judge` method.")
+
+
+class PairRMJudge(BasePairwiseJudge):
+    # docstyle-ignore
+    """
+    LLM judge based on the PairRM model from AllenAI.
+
+    This judge uses the PairRM model to rank pairs of completions for given prompts. It's designed for pairwise
+    comparison of language model outputs. The PairRM model is loaded using the llm-blender library and runs on the
+    default Accelerator device.
+
+    **Attributes**:
+
+        blender (`llm_blender.Blender`):
+            An instance of the Blender class from llm-blender.
+
+    **Example**:
+    ```python
+    >>> pairrm_judge = PairRMJudge()
+    >>> prompts = ["Translate 'hello' to French", "What's the capital of Japan?"]
+    >>> completions = [["Bonjour", "Salut"], ["Kyoto", "Tokyo"]]
+    >>> results = pairrm_judge.judge(prompts, completions)
+    >>> print(results)  # [0, 1] (indicating the first completion is preferred for the first prompt and the second)
+    ```
+
+    > [!TIP]
+    > This class requires the llm-blender library to be installed. Install it with: `pip install llm-blender`.
+    """
+
+    def __init__(self):
+        if not is_llm_blender_available():
+            raise ValueError(
+                "llm-blender is not installed. Please install it with `pip install llm-blender`."
+            )
+        self.blender = llm_blender.Blender()
+        self.blender.loadranker("llm-blender/PairRM", device=Accelerator().device)
+
+    def judge(
+        self,
+        prompts: list[str],
+        completions: list[list[str]],
+        shuffle_order: bool = True,
+        return_scores: bool = False,
+        temperature: float = 1.0,
+    ) -> list[int | float]:
+        """
+        Judge the completion pairs for the given prompts using the PairRM model.
+
+        Args:
+            prompts (`list[str]`):
+                List of prompts to judge.
+            completions (`list[list[str]]`):
+                List of completion pairs for each prompt.
+            shuffle_order (`bool`, *optional*, defaults to `True`):
+                Whether to shuffle the order of the completions to avoid positional bias.
+            return_scores (`bool`, *optional*, defaults to `False`):
+                If `True`, return probability scores of the first completion instead of ranks (i.e. a *soft-judge*).
+            temperature (`float`, *optional*, defaults to `1.0`):
+                Temperature for scaling logits if `return_scores` is True.
+
+        Returns
+        -------
+            `list[int | float]`:
+                If `return_scores` is `False`, returns a list of ranks (`0` or `1`) for each prompt, indicating which
+                completion is preferred. If `return_scores` is `True`, returns softmax probabilities for the first
+                completion.
+
+        Raises
+        ------
+            `ValueError`:
+                If the number of completions per prompt is not exactly 2.
+
+        Note:
+            Unlike llm-blender, ranks are 0-indexed (`0` means the first completion is preferred).
+        """
+        if len(completions[0]) != 2:
+            raise ValueError("PairRM judge requires exactly 2 completions per prompt.")
+
+        # Shuffle the order of the completions to avoid positional bias
+        if shuffle_order:
+            flip_mask = np.random.choice([True, False], size=len(prompts))
+            completions = [
+                pair[::-1] if flip else pair
+                for flip, pair in zip(flip_mask, completions, strict=True)
+            ]
+
+        # Rank the completions
+        ranks = self.blender.rank(
+            prompts, completions, return_scores=return_scores, disable_tqdm=True
+        )
+        if not return_scores:
+            ranks -= (
+                1  # PairRM rank is 1-indexed, so we subtract 1 to make it 0-indexed
+            )
+        else:
+            # scale the logits by temperature
+            ranks /= temperature
+
+        # Flip back the ranks or scores to the original order if needed
+        if shuffle_order:
+            ranks[flip_mask] = ranks[flip_mask][:, ::-1]
+
+        # Return the ranks or score probability
+        if return_scores:
+            logit_max = np.amax(ranks, axis=-1, keepdims=True)
+            exp_logit_shifted = np.exp(ranks - logit_max)
+            probs = exp_logit_shifted / np.sum(
+                exp_logit_shifted, axis=-1, keepdims=True
+            )
+            return probs[:, 0].tolist()
+        return ranks[:, 0].tolist()
+
+
+class HfPairwiseJudge(BasePairwiseJudge):
+    """
+    Pairwise judge based on the Hugging Face API with chat completion.
+
+    This judge is relevant for assessing the quality chat models, where the completion is a response to a given prompt.
+
+    Args:
+        model (`str`, *optional*, defaults to `"meta-llama/Meta-Llama-3-70B-Instruct"`):
+            Model to use for the judge.
+        token (`str`, *optional*):
+            Hugging Face API token to use for the [`huggingface_hub.InferenceClient`].
+        system_prompt (`str`, *optional*):
+            The system prompt to be used for the judge. If not provided, a default prompt is used. Note that the system
+            prompt should contain the following placeholders: `{prompt}`, `{response0}`, and `{response1}`. Also, the
+            inference is called with `max_tokens=1`, consequently the system prompt should ask for a single token
+            response.
+    """
+
+    def __init__(
+        self,
+        model="meta-llama/Meta-Llama-3-70B-Instruct",
+        token: str | None = None,
+        system_prompt: str | None = None,
+    ):
+        self.client = InferenceClient(model=model, token=token)
+        self.system_prompt = system_prompt or DEFAULT_PAIRWISE_SYSTEM_PROMPT
+
+    def judge(
+        self,
+        prompts: list[str],
+        completions: list[list[str]],
+        shuffle_order: bool = True,
+    ) -> list[int]:
+        # Shuffle the order of the completions to avoid positional bias
+        if shuffle_order:
+            flip_mask = np.random.choice([True, False], size=len(prompts))
+            completions = [
+                pair[::-1] if flip else pair
+                for flip, pair in zip(flip_mask, completions, strict=True)
+            ]
+
+        # Define a function to get the rank for a single prompt, will be called concurrently
+        def get_rank(prompt, candidates):
+            content = self.system_prompt.format(
+                prompt=prompt, response0=candidates[0], response1=candidates[1]
+            )
+            completion = self.client.chat_completion(
+                messages=[{"role": "user", "content": content}], max_tokens=1
+            )
+            response = completion.choices[0].message.content
+            if response in ["0", "1"]:
+                return int(response)
+            logging.debug(
+                f"Invalid response from the judge model: '{response}'. Returning -1."
+            )
+            return -1
+
+        # Call the completions concurrently
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            ranks = list(executor.map(get_rank, prompts, completions))
+
+        # Flip back the ranks to the original order if needed
+        if shuffle_order:
+            ranks = [
+                ranks[i] if not flip else 1 - ranks[i]
+                for i, flip in enumerate(flip_mask)
+            ]
+
+        # Return the ranks
+        return ranks
+
+
+class OpenAIPairwiseJudge(BasePairwiseJudge):
+    """
+    Judge based on the OpenAI API.
+
+    This judge is relevant for assessing the quality chat models, where the completion is a response to a given prompt.
+
+    Args:
+        model (`str`, *optional*, defaults to `"gpt-4-turbo-preview"`):
+            Model to use for the judge.
+        system_prompt (`str`, *optional*):
+            System prompt to be used for the judge. If not provided, a default prompt is used. Note that the system
+            prompt should contain the following placeholders: `{prompt}`, `{response0}`, and `{response1}`. Also, the
+            inference is called with `max_tokens=1`, consequently the system prompt should ask for a single token
+            response.
+        max_requests (`int` or `None`, *optional*, defaults to `1000`):
+            Maximum number of requests to make to the OpenAI API. If set to `None`, there is no limit.
+    """
+
+    def __init__(
+        self,
+        model="gpt-4-turbo-preview",
+        system_prompt: str | None = None,
+        max_requests: int | None = 1_000,
+    ):
+        if not is_openai_available():
+            raise ValueError(
+                "OpenAI client is not installed. Please install it with 'pip install openai'."
+            )
+        self.client = OpenAI()
+        self.model = model
+        self.system_prompt = system_prompt or DEFAULT_PAIRWISE_SYSTEM_PROMPT
+        self.max_requests = max_requests
+        self.num_requests = 0
+        self._warned = False
+
+    def judge(
+        self,
+        prompts: list[str],
+        completions: list[list[str]],
+        shuffle_order: bool = True,
+    ) -> list[int]:
+        # Check if the limit of requests is reached, if so, use random choice instead
+        if self.max_requests is not None and self.num_requests >= self.max_requests:
+            if not self._warned:  # Print the warning only once
+                logging.warning(
+                    f"Reached the maximum number of requests ({self.max_requests}). From now on, returning -1 instead. "
+                    " To increase the limit, set `max_requests` to a higher value, or to `None` for no limit."
+                )
+                self._warned = True
+            return [-1] * len(prompts)
+
+        # Shuffle the order of the completions to avoid positional bias
+        if shuffle_order:
+            flip_mask = np.random.choice([True, False], size=len(prompts))
+            completions = [
+                pair[::-1] if flip else pair
+                for flip, pair in zip(flip_mask, completions, strict=True)
+            ]
+
+        # Define a function to get the rank for a single prompt, will be called concurrently
+        def get_rank(prompt, candidates):
+            content = self.system_prompt.format(
+                prompt=prompt, response0=candidates[0], response1=candidates[1]
+            )
+            messages = [{"role": "user", "content": content}]
+            completion = self.client.chat.completions.create(
+                model=self.model, messages=messages, max_tokens=1
+            )
+            response = completion.choices[0].message.content
+            if response in ["0", "1"]:
+                return int(response)
+            logging.debug(
+                f"Invalid response from the judge model: '{response}'. Returning -1."
+            )
+            return -1
+
+        # Call the completions concurrently
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            ranks = list(executor.map(get_rank, prompts, completions))
+
+        # Flip back the ranks to the original order if needed
+        if shuffle_order:
+            ranks = [
+                ranks[i] if not flip else 1 - ranks[i]
+                for i, flip in enumerate(flip_mask)
+            ]
+
+        # Update the number of requests
+        self.num_requests += len(prompts)
+
+        # Return the ranks
+        return ranks
+
+
+class AllTrueJudge(BaseBinaryJudge):
+    """
+    Unify the decision of multiple [`experimental.judges.BaseBinaryJudge`] instances.
+
+    Returns `1` only if all inner binary judges return `1`. If any judge returns `0`, it returns `0`. If any judge
+    returns `-1`, indicating a failure in its process, this judge will also return `-1`.
+
+    Implements the Mixture of Judges as described in the [CGPO paper](https://huggingface.co/papers/2409.20370).
+
+    Args:
+        judges (`list[BaseBinaryJudge]`):
+            A list of [`experimental.judges.BaseBinaryJudge`] instances whose decisions will be unified.
+    """
+
+    def __init__(self, judges: list[BaseBinaryJudge]):
+        self.judges = judges
+
+    def judge(
+        self,
+        prompts: list[str],
+        completions: list[str],
+        gold_completions: list[str] | None = None,
+        shuffle_order: bool = True,
+    ) -> list[int]:
+        all_binary_judgments = [
+            judge.judge(prompts, completions, gold_completions, shuffle_order)
+            for judge in self.judges
+        ]
+        output = []
+        for binary_judgments in zip(*all_binary_judgments, strict=True):
+            # Check that all values are in {0, 1, -1}
+            if any(
+                binary_judgment not in {0, 1, -1}
+                for binary_judgment in binary_judgments
+            ):
+                raise ValueError(
+                    f"Invalid binary judgment: {binary_judgments}, expected list of values in {{0, 1, -1}}."
+                )
+
+            # Unify the decision
+            if -1 in binary_judgments:
+                output.append(-1)
+            elif all(binary_judgment == 1 for binary_judgment in binary_judgments):
+                output.append(1)
+            else:
+                output.append(0)
+        return output
diff --git a/src/aixpert/training/training/trl/experimental/papo/__init__.py b/src/aixpert/training/training/trl/experimental/papo/__init__.py
new file mode 100644
index 0000000..d976785
--- /dev/null
+++ b/src/aixpert/training/training/trl/experimental/papo/__init__.py
@@ -0,0 +1,17 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from .papo_config import PAPOConfig
+from .papo_trainer import PAPOTrainer
diff --git a/src/aixpert/training/training/trl/experimental/papo/papo_config.py b/src/aixpert/training/training/trl/experimental/papo/papo_config.py
new file mode 100644
index 0000000..ec71ee4
--- /dev/null
+++ b/src/aixpert/training/training/trl/experimental/papo/papo_config.py
@@ -0,0 +1,77 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import Literal
+
+from ...trainer.grpo_config import GRPOConfig
+
+
+@dataclass
+class PAPOConfig(GRPOConfig):
+    """
+    Configuration class for PAPOTrainer.
+
+    PAPO (Perception-Aware Policy Optimization) extends GRPO/DAPO for multimodal reasoning by adding an implicit
+    perception loss and double entropy regularization.
+
+    Args:
+        perception_loss_weight (`float`, *optional*, defaults to `0.1`):
+            gamma Weight coefficient for the perception loss term. This encourages the model to be sensitive to visual
+            changes.
+
+        mask_ratio (`float`, *optional*, defaults to `0.3`):
+            Ratio of the image to mask when computing perception loss.
+
+        mask_type (`Literal["random", "patch", "grid"]`, *optional*, defaults to `"random"`):
+            Type of masking strategy to use.
+
+        der_loss_weight1 (`float`, *optional*, defaults to `0.03`):
+            eta1 Weight coefficient for the Double Entropy Regularization (DER) term. This term encourages confident
+            predictions with original images (low entropy) and uncertain predictions with masked images (high entropy).
+
+        der_loss_weight2 (`float`, *optional*, defaults to `0.03`):
+            eta2 Weight coefficient for the Double Entropy Regularization (DER) term. This term encourages confident
+            predictions with original images (low entropy) and uncertain predictions with masked images (high entropy).
+
+        loss_type (`Literal["grpo", "dapo"]`, inherited from GRPOConfig):
+            Base loss type to use. Set to "grpo" for PAPO-G or "dapo" for PAPO-D.
+    """
+
+    perception_loss_weight: float = 0.1
+    mask_ratio: float = 0.3
+    mask_type: Literal["random", "patch", "grid"] = "random"
+
+    # Added for Double Entropy Regularization
+    der_loss_weight1: float = 0.03
+    der_loss_weight2: float = 0.03
+
+    def __post_init__(self):
+        super().__post_init__()
+
+        # Validation
+        if not 0.0 <= self.mask_ratio <= 1.0:
+            raise ValueError(
+                f"mask_ratio must be between 0 and 1, got {self.mask_ratio}"
+            )
+
+        if self.der_loss_weight1 < 0 or self.der_loss_weight2 < 0:
+            raise ValueError(
+                f"der_loss_weight1 and der_loss_weight2 must be non-negative, got {self.der_loss_weight1} and {self.der_loss_weight2}"
+            )
+
+        if self.mask_type not in ["random", "patch", "grid"]:
+            raise ValueError(
+                f"mask_type must be one of ['random', 'patch', 'grid'], got {self.mask_type}"
+            )
diff --git a/src/aixpert/training/training/trl/experimental/papo/papo_trainer.py b/src/aixpert/training/training/trl/experimental/papo/papo_trainer.py
new file mode 100644
index 0000000..c03c5af
--- /dev/null
+++ b/src/aixpert/training/training/trl/experimental/papo/papo_trainer.py
@@ -0,0 +1,417 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import random
+import textwrap
+
+import torch
+from datasets import Dataset, IterableDataset
+from transformers import PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin
+
+from ...trainer.grpo_trainer import GRPOTrainer, RewardFunc
+from ...trainer.utils import nanmax, nanmin
+from .papo_config import PAPOConfig
+
+
+class PAPOTrainer(GRPOTrainer):
+    """
+    Trainer for Perception-Aware Policy Optimization (PAPO).
+
+    PAPO extends GRPO/DAPO for multimodal reasoning by adding an implicit perception loss that encourages the model to
+    better utilize visual information. The key innovation is computing KL divergence between model outputs on original
+    vs. corrupted (masked) images.
+
+    Two variants are supported:
+    - PAPO-G: PAPO + GRPO (use loss_type="grpo")
+    - PAPO-D: PAPO + DAPO (use loss_type="dapo")
+
+    Example:
+
+    ```python
+    from datasets import load_dataset
+    from trl import PAPOTrainer, PAPOConfig
+
+    dataset = load_dataset("your-vlm-dataset", split="train")
+
+
+    def reward_func(completions, **kwargs):
+        # Your reward function for multimodal reasoning
+        return [compute_reward(c) for c in completions]
+
+
+    # PAPO-G
+    config = PAPOConfig(
+        loss_type="grpo",  # Use GRPO as base
+        perception_loss_weight=0.1,
+        mask_ratio=0.3,
+    )
+
+    # PAPO-G
+    config = PAPOConfig(
+        loss_type="dapo",  # Use DAPO as base
+        perception_loss_weight=0.1,
+        mask_ratio=0.3,
+    )
+
+    trainer = PAPOTrainer(
+        model="Qwen/Qwen2-VL-2B-Instruct",
+        reward_funcs=reward_func,
+        args=config,
+        train_dataset=dataset,
+    )
+
+    trainer.train()
+    ```
+
+    Args:
+        model (`Union[str, PreTrainedModel]`):
+            Model to be trained (must be a vision-language model).
+        reward_funcs (`Union[RewardFunc, list[RewardFunc]]`):
+            Reward functions for computing rewards (same as GRPO).
+        args ([`PAPOConfig`], *optional*, defaults to `None`):
+            Configuration for this trainer. If `None`, a default configuration is used.
+        train_dataset ([`~datasets.Dataset`] or [`~datasets.IterableDataset`]):
+            Dataset to use for training. Must include "prompt" and "image" columns.
+        eval_dataset: Same requirements as train_dataset.
+        processing_class: Processing class (tokenizer/processor) for the model.
+        reward_processing_classes: Processing classes for reward models.
+        callbacks: Training callbacks.
+        optimizers: Optimizer and scheduler tuple.
+        peft_config: PEFT configuration if using parameter-efficient fine-tuning.
+    """
+
+    _tag_names = ["trl", "papo"]
+    _name = "PAPO"
+    _paper = {
+        "title": "Perception-Aware Policy Optimization for Multimodal Reasoning",
+        "id": "2507.06448",
+        # docstyle-ignore
+        "citation": textwrap.dedent(
+            """\
+            @misc{wang2025perceptionawarepolicyoptimizationmultimodal,
+                title        = {{Perception-Aware Policy Optimization for Multimodal Reasoning}},
+                author       = {Zhenhailong Wang and Xuehang Guo and Sofia Stoica and Haiyang Xu and Hongru Wang and Hyeonjeong Ha and Xiusi Chen and Yangyi Chen and Ming Yan and Fei Huang and Heng Ji},
+                year         = 2025,
+                url          = {https://arxiv.org/abs/2507.06448},
+                archivePrefix= {arXiv},
+                eprint       = {2507.06448},
+                primaryClass = {cs.CL}
+            }"""
+        ),
+    }
+
+    def __init__(
+        self,
+        model: str | PreTrainedModel,
+        reward_funcs: RewardFunc | list[RewardFunc],
+        args: PAPOConfig | None = None,
+        train_dataset: Dataset | IterableDataset | None = None,
+        eval_dataset: Dataset
+        | IterableDataset
+        | dict[str, Dataset | IterableDataset]
+        | None = None,
+        processing_class: PreTrainedTokenizerBase | ProcessorMixin | None = None,
+        reward_processing_classes: PreTrainedTokenizerBase
+        | list[PreTrainedTokenizerBase]
+        | None = None,
+        callbacks=None,
+        optimizers=(None, None),
+        peft_config=None,
+    ):
+        # Initialize with default PAPO config if not provided
+        if args is None:
+            model_name = model if isinstance(model, str) else model.config._name_or_path
+            model_name = model_name.split("/")[-1]
+            args = PAPOConfig(f"{model_name}-PAPO")
+
+        # Store PAPO-specific parameters
+        self.perception_loss_weight = args.perception_loss_weight
+        self.mask_ratio = args.mask_ratio
+        self.mask_type = args.mask_type
+        self.der_loss_weight1 = args.der_loss_weight1
+        self.der_loss_weight2 = args.der_loss_weight2
+
+        # Initialize parent GRPO trainer
+        super().__init__(
+            model=model,
+            reward_funcs=reward_funcs,
+            args=args,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            processing_class=processing_class,
+            reward_processing_classes=reward_processing_classes,
+            callbacks=callbacks,
+            optimizers=optimizers,
+            peft_config=peft_config,
+        )
+
+    def _mask_image(
+        self, pixel_values: torch.Tensor, mask_ratio: float = None
+    ) -> torch.Tensor:
+        """
+        Apply masking to image pixel values.
+
+        Args:
+            pixel_values: Image tensor of shape (B, C, H, W) or (B, N, C, H, W) for multi-image
+            mask_ratio: Ratio of image to mask (defaults to self.mask_ratio)
+
+        Returns
+        -------
+            Masked pixel values tensor
+        """
+        if mask_ratio is None:
+            mask_ratio = self.mask_ratio
+
+        masked_pixel_values = pixel_values.clone()
+
+        if self.mask_type == "random":
+            # Random pixel masking
+            mask = torch.rand_like(pixel_values) > mask_ratio
+            masked_pixel_values = masked_pixel_values * mask
+
+        elif self.mask_type == "patch":
+            # Patch-based masking (mask contiguous regions)
+            B = pixel_values.shape[0]
+            if pixel_values.ndim == 4:  # (B, C, H, W)
+                C, H, W = pixel_values.shape[1:]
+                for i in range(B):
+                    # Calculate patch size to mask
+                    patch_h = int(H * mask_ratio**0.5)
+                    patch_w = int(W * mask_ratio**0.5)
+                    # Random starting position
+                    start_h = random.randint(0, max(0, H - patch_h))
+                    start_w = random.randint(0, max(0, W - patch_w))
+                    # Apply mask
+                    masked_pixel_values[
+                        i, :, start_h : start_h + patch_h, start_w : start_w + patch_w
+                    ] = 0
+
+            elif pixel_values.ndim == 5:  # (B, N, C, H, W) for multi-image
+                N, C, H, W = pixel_values.shape[1:]
+                for i in range(B):
+                    for n in range(N):
+                        patch_h = int(H * mask_ratio**0.5)
+                        patch_w = int(W * mask_ratio**0.5)
+                        start_h = random.randint(0, max(0, H - patch_h))
+                        start_w = random.randint(0, max(0, W - patch_w))
+                        masked_pixel_values[
+                            i,
+                            n,
+                            :,
+                            start_h : start_h + patch_h,
+                            start_w : start_w + patch_w,
+                        ] = 0
+
+        elif self.mask_type == "grid":
+            # Grid-based masking (mask regular grid cells)
+            if pixel_values.ndim == 4:
+                C, H, W = pixel_values.shape[1:]
+                grid_size = int((1 / mask_ratio) ** 0.5)
+                cell_h, cell_w = H // grid_size, W // grid_size
+
+                for i in range(grid_size):
+                    for j in range(grid_size):
+                        if random.random() < mask_ratio:
+                            masked_pixel_values[
+                                :,
+                                :,
+                                i * cell_h : (i + 1) * cell_h,
+                                j * cell_w : (j + 1) * cell_w,
+                            ] = 0
+
+        return masked_pixel_values
+
+    def _compute_loss(self, model, inputs):
+        # >>> 1. GRPO loss
+        # Compute the per-token log probabilities for the model
+        prompt_ids, prompt_mask = inputs["prompt_ids"], inputs["prompt_mask"]
+        completion_ids, completion_mask = (
+            inputs["completion_ids"],
+            inputs["completion_mask"],
+        )
+        input_ids = torch.cat([prompt_ids, completion_ids], dim=1)
+        attention_mask = torch.cat([prompt_mask, completion_mask], dim=1)
+        logits_to_keep = completion_ids.size(
+            1
+        )  # we only need to compute the logits for the completion tokens
+
+        # Compute the per_token_logps and the entropy at each position in the completion
+        per_token_logps, entropies = self._get_per_token_logps_and_entropies(
+            model,
+            input_ids,
+            attention_mask,
+            logits_to_keep,
+            compute_entropy=True,
+            pixel_values=inputs.get("pixel_values"),
+            image_grid_thw=inputs.get("image_grid_thw"),
+            num_images=inputs.get("num_images"),
+            pixel_attention_mask=inputs.get("pixel_attention_mask"),
+            image_sizes=inputs.get("image_sizes"),
+        )
+
+        if self.top_entropy_quantile < 1.0:
+            entropy_mask = self.get_high_entropy_mask(
+                entropies, completion_mask, 1 - self.top_entropy_quantile
+            )
+        else:
+            entropy_mask = None
+
+        # Compute the KL divergence between the model and the reference model
+        if self.beta != 0.0:
+            ref_per_token_logps = inputs["ref_per_token_logps"]
+            per_token_kl = (
+                torch.exp(ref_per_token_logps - per_token_logps)
+                - (ref_per_token_logps - per_token_logps)
+                - 1
+            )
+
+        # Compute the loss
+        advantages = inputs["advantages"]
+        # When using num_iterations == 1 and steps_per_generation <= gradient_accumulation_steps
+        # old_per_token_logps == per_token_logps, so we can skip it's computation
+        # (see _generate_and_score_completions) and use per_token_logps.detach() instead.
+        old_per_token_logps = inputs.get("old_per_token_logps")
+        old_per_token_logps = (
+            per_token_logps.detach()
+            if old_per_token_logps is None
+            else old_per_token_logps
+        )
+
+        log_ratio = per_token_logps - old_per_token_logps
+        if self.importance_sampling_level == "token":
+            log_importance_weights = log_ratio
+        elif self.importance_sampling_level == "sequence":
+            log_importance_weights = (log_ratio * completion_mask).sum(
+                -1
+            ) / completion_mask.sum(-1).clamp(min=1.0)
+            log_importance_weights = log_importance_weights.unsqueeze(-1)
+        else:
+            raise ValueError(
+                f"Unknown importance sampling level: {self.importance_sampling_level}. Possible values are 'token' "
+                "and 'sequence'."
+            )
+        # From here, log_importance_weights (and all subsequent tensors, coef_1, coef_2, etc.) shape depends on
+        # importance_sampling_level: "token" level: (B, T); "sequence" level: (B, 1)
+
+        coef_1 = torch.exp(log_importance_weights)
+        coef_2 = torch.clamp(coef_1, 1 - self.epsilon_low, 1 + self.epsilon_high)
+
+        # Two-sided clipping
+        if self.args.delta is not None:
+            coef_1 = torch.clamp(coef_1, max=self.args.delta)
+
+        per_token_loss1 = coef_1 * advantages.unsqueeze(1)
+        per_token_loss2 = coef_2 * advantages.unsqueeze(1)
+        per_token_loss = -torch.min(per_token_loss1, per_token_loss2)
+        if entropy_mask is not None:
+            per_token_loss = per_token_loss * entropy_mask
+        if self.beta != 0.0:
+            per_token_loss = per_token_loss + self.beta * per_token_kl
+
+        if self.loss_type == "grpo":
+            loss = (
+                (per_token_loss * completion_mask).sum(-1)
+                / completion_mask.sum(-1).clamp(min=1.0)
+            ).mean()
+            loss = loss / self.current_gradient_accumulation_steps
+        elif self.loss_type == "dapo":
+            normalizer = inputs["num_items_in_batch"] / self.accelerator.num_processes
+            loss = (per_token_loss * completion_mask).sum() / normalizer
+        else:
+            raise ValueError(f"Unknown loss type: {self.loss_type}")
+        # >>> 2. Implicit Perception Loss
+        inputs["pixel_values"] = self._mask_image(
+            inputs["pixel_values"], self.mask_ratio
+        )
+        mask_img_per_token_logps, mask_img_entropies = (
+            self._get_per_token_logps_and_entropies(
+                model,
+                input_ids,
+                attention_mask,
+                logits_to_keep,
+                compute_entropy=True,
+                pixel_values=inputs.get("pixel_values"),
+                image_grid_thw=inputs.get("image_grid_thw"),
+                num_images=inputs.get("num_images"),
+                pixel_attention_mask=inputs.get("pixel_attention_mask"),
+                image_sizes=inputs.get("image_sizes"),
+            )
+        )
+        perception_kl = (
+            torch.exp(mask_img_per_token_logps - per_token_logps)
+            - (mask_img_per_token_logps - per_token_logps)
+            - 1
+        )
+        perception_kl = torch.clamp(perception_kl, min=0.0, max=0.2)
+        perception_loss = self.perception_loss_weight * perception_kl
+
+        # >>> 3. Double Entropy Loss
+        der_loss = (
+            self.der_loss_weight1 * entropies
+            + self.der_loss_weight2 * mask_img_entropies
+        )
+
+        # PAPO Loss
+        loss = (loss - perception_loss + der_loss).mean()
+        # Log the metrics
+        mode = "train" if self.model.training else "eval"
+
+        completion_token_count = completion_mask.sum().clamp(min=1.0)
+
+        def masked_batch_mean(x):
+            if x.shape[1] == 1:  # when importance_sampling_level == "sequence"
+                return x.mean()
+            return (x * completion_mask).sum() / completion_token_count
+
+        if self.beta != 0.0:
+            mean_kl = masked_batch_mean(per_token_kl)
+            self._metrics[mode]["kl"].append(
+                self.accelerator.gather(mean_kl).nanmean().item()
+            )
+
+        mean_entropy = masked_batch_mean(entropies)
+        self._metrics[mode]["entropy"].append(
+            self.accelerator.gather(mean_entropy).nanmean().item()
+        )
+
+        # Compute the clipped probability ratios
+        is_low_clipped = (coef_1 < 1 - self.epsilon_low) & (advantages.unsqueeze(1) < 0)
+        is_high_clipped = (coef_1 > 1 + self.epsilon_high) & (
+            advantages.unsqueeze(1) > 0
+        )
+        is_region_clipped = is_low_clipped | is_high_clipped
+
+        low_clip = masked_batch_mean(is_low_clipped.float())
+        high_clip = masked_batch_mean(is_high_clipped.float())
+        clip_ratio = masked_batch_mean(is_region_clipped.float())
+
+        gathered_low_clip = self.accelerator.gather(low_clip)
+        self._metrics[mode]["clip_ratio/low_mean"].append(
+            gathered_low_clip.nanmean().item()
+        )
+        self._metrics[mode]["clip_ratio/low_min"].append(
+            nanmin(gathered_low_clip).item()
+        )
+        gathered_high_clip = self.accelerator.gather(high_clip)
+        self._metrics[mode]["clip_ratio/high_mean"].append(
+            gathered_high_clip.nanmean().item()
+        )
+        self._metrics[mode]["clip_ratio/high_max"].append(
+            nanmax(gathered_high_clip).item()
+        )
+        gathered_clip_ratio = self.accelerator.gather(clip_ratio)
+        self._metrics[mode]["clip_ratio/region_mean"].append(
+            gathered_clip_ratio.nanmean().item()
+        )
+        return loss
diff --git a/src/aixpert/training/training/trl/experimental/xpo/__init__.py b/src/aixpert/training/training/trl/experimental/xpo/__init__.py
new file mode 100644
index 0000000..ca4a4a6
--- /dev/null
+++ b/src/aixpert/training/training/trl/experimental/xpo/__init__.py
@@ -0,0 +1,19 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .xpo_config import XPOConfig
+from .xpo_trainer import XPOTrainer
+
+
+__all__ = ["XPOConfig", "XPOTrainer"]
diff --git a/src/aixpert/training/training/trl/experimental/xpo/xpo_config.py b/src/aixpert/training/training/trl/experimental/xpo/xpo_config.py
new file mode 100644
index 0000000..5fde2f2
--- /dev/null
+++ b/src/aixpert/training/training/trl/experimental/xpo/xpo_config.py
@@ -0,0 +1,45 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass, field
+
+from ...trainer.online_dpo_config import OnlineDPOConfig
+
+
+@dataclass
+class XPOConfig(OnlineDPOConfig):
+    r"""
+    Configuration class for the [`experimental.xpo.XPOTrainer`].
+
+    Subclass of [`OnlineDPOConfig`] we can use all its arguments and add the following:
+
+    Parameters
+    ----------
+        alpha (`float` or `list[float]`, *optional*, defaults to `1e-5`):
+            Weight of the XPO loss term. If a list of floats is provided then the alpha is selected for each new epoch
+            and the last alpha is used for the rest of the epochs.
+    """
+
+    alpha: list[float] = field(
+        default_factory=lambda: [1e-5],
+        metadata={
+            "help": "Weight of the XPO loss term. If a list of floats is provided then the alpha is selected for each "
+            "new epoch and the last alpha is used for the rest of the epochs."
+        },
+    )
+
+    def __post_init__(self):
+        super().__post_init__()
+        if hasattr(self.alpha, "__len__") and len(self.alpha) == 1:
+            self.alpha = self.alpha[0]
diff --git a/src/aixpert/training/training/trl/experimental/xpo/xpo_trainer.py b/src/aixpert/training/training/trl/experimental/xpo/xpo_trainer.py
new file mode 100644
index 0000000..3e7c00c
--- /dev/null
+++ b/src/aixpert/training/training/trl/experimental/xpo/xpo_trainer.py
@@ -0,0 +1,650 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import textwrap
+from collections.abc import Callable
+from typing import Any
+
+import jinja2
+import torch
+import torch.nn.functional as F
+from datasets import Dataset, IterableDataset
+from torch import nn
+from transformers import (
+    BaseImageProcessor,
+    FeatureExtractionMixin,
+    PreTrainedModel,
+    PreTrainedTokenizerBase,
+    ProcessorMixin,
+    TrainerCallback,
+)
+from transformers.trainer_utils import EvalPrediction
+from transformers.training_args import OptimizerNames
+from transformers.utils import is_peft_available
+
+from ...data_utils import is_conversational, maybe_apply_chat_template
+from ...models.utils import unwrap_model_for_generation
+from ...trainer.judges import BasePairwiseJudge
+from ...trainer.online_dpo_trainer import OnlineDPOTrainer
+from ...trainer.utils import (
+    SIMPLE_CHAT_TEMPLATE,
+    empty_cache,
+    get_reward,
+    selective_log_softmax,
+    truncate_right,
+)
+from .xpo_config import XPOConfig
+
+
+if is_peft_available():
+    from peft import PeftModel
+
+
+class XPOTrainer(OnlineDPOTrainer):
+    """
+    Trainer for Exploratory Preference Optimization (XPO).
+
+    It is implemented as a subclass of [`OnlineDPOTrainer`].
+
+    Args:
+        model ([`~transformers.PreTrainedModel`]):
+            The model to train, preferably an `AutoModelForCausalLM`.
+        ref_model ([`PreTrainedModelWrapper`]):
+            Hugging Face transformer model with a casual language modelling head. Used for implicit reward computation
+            and loss. If no reference model is provided, the trainer will create a reference model with the same
+            architecture as the model to be optimized.
+        reward_funcs ([`~transformers.PreTrainedModel`]):
+            The reward model to score completions with, preferably an
+            [`~transformers.AutoModelForSequenceClassification`].
+        judge ([`experimental.judges.BasePairwiseJudge`]):
+            The judge to use for pairwise comparison of model completions.
+        args ([`experimental.xpo.XPOConfig`]):
+            The XPO config arguments to use for training.
+        data_collator ([`~transformers.DataCollator`]):
+            The data collator to use for training. If None is specified, the default data collator
+            ([`DPODataCollatorWithPadding`]) will be used which will pad the sequences to the maximum length of the
+            sequences in the batch, given a dataset of paired sequences.
+        train_dataset ([`~datasets.Dataset`]):
+            The dataset to use for training.
+        eval_dataset ([`~datasets.Dataset`]):
+            The dataset to use for evaluation.
+        processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*):
+            Processing class used to process the data. If provided, will be used to automatically process the inputs
+            for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
+            reuse the fine-tuned model.
+        peft_config (`dict`):
+            The peft config to use for training.
+        compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
+            The function to use to compute the metrics. Must take a `EvalPrediction` and return a dictionary string to
+            metric values.
+        callbacks (`list[transformers.TrainerCallback]`):
+            The callbacks to use for training.
+        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
+            The optimizer and scheduler to use for training.
+        preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
+            The function to use to preprocess the logits before computing the metrics.
+    """
+
+    _tag_names = ["trl", "xpo"]
+    _name = "XPO"
+    _paper = {
+        "title": "Exploratory Preference Optimization: Harnessing Implicit Q*-Approximation for Sample-Efficient RLHF",
+        "id": "2405.21046",
+        # docstyle-ignore
+        "citation": textwrap.dedent("""\
+            @article{jung2024binary,
+                title        = {{Exploratory Preference Optimization: Harnessing Implicit Q*-Approximation for Sample-Efficient RLHF}},
+                author       = {Tengyang Xie and Dylan J. Foster and Akshay Krishnamurthy and Corby Rosset and Ahmed Awadallah and Alexander Rakhlin},
+                year         = 2024,
+                eprint       = {arXiv:2405.21046}
+            }"""),
+    }
+
+    def __init__(
+        self,
+        model: PreTrainedModel | nn.Module = None,
+        ref_model: PreTrainedModel | nn.Module = None,
+        reward_funcs: nn.Module | None = None,
+        judge: BasePairwiseJudge | None = None,
+        args: XPOConfig | None = None,
+        data_collator: Callable | None = None,
+        train_dataset: Dataset | IterableDataset | None = None,
+        eval_dataset: Dataset | dict[str, Dataset] | None = None,
+        processing_class: PreTrainedTokenizerBase
+        | BaseImageProcessor
+        | FeatureExtractionMixin
+        | ProcessorMixin
+        | None = None,
+        reward_processing_classes: PreTrainedTokenizerBase
+        | list[PreTrainedTokenizerBase]
+        | None = None,
+        peft_config: dict | None = None,
+        compute_metrics: Callable[[EvalPrediction], dict] | None = None,
+        callbacks: list[TrainerCallback] | None = None,
+        optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (
+            None,
+            None,
+        ),
+        preprocess_logits_for_metrics: Callable[
+            [torch.Tensor, torch.Tensor], torch.Tensor
+        ]
+        | None = None,
+    ) -> None:
+        super().__init__(
+            model=model,
+            ref_model=ref_model,
+            judge=judge,
+            reward_funcs=reward_funcs,
+            args=args,
+            data_collator=data_collator,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            processing_class=processing_class,
+            reward_processing_classes=reward_processing_classes,
+            peft_config=peft_config,
+            compute_metrics=compute_metrics,
+            callbacks=callbacks,
+            optimizers=optimizers,
+            preprocess_logits_for_metrics=preprocess_logits_for_metrics,
+        )
+
+        self._alpha = self.args.alpha
+
+        # Overwrite the stats dictionary to include XPO specific statistics
+        self.stats = {
+            # Remove "non_score_reward", "rlhf_reward", "scores"
+            # Add "loss/dpo", "loss/xpo"
+            "loss/dpo": [],
+            "loss/xpo": [],
+            "objective/kl": [],
+            "objective/entropy": [],
+            "rewards/chosen": [],
+            "rewards/rejected": [],
+            "rewards/accuracies": [],
+            "rewards/margins": [],
+            "logps/chosen": [],
+            "logps/rejected": [],
+            # Replace "contain_eos_token" by "model_contain_eos_token" and "ref_contain_eos_token"
+            "val/model_contain_eos_token": [],
+            "val/ref_contain_eos_token": [],
+            "alpha": [],
+            "beta": [],
+        }
+        if self.reward_funcs is not None:
+            if len(self.reward_funcs) != 1:
+                raise ValueError("XPOTrainer only supports one reward function/model.")
+            self.reward_funcs = self.reward_funcs[0]
+            self.stats["objective/model_scores"] = []
+            self.stats["objective/ref_scores"] = []
+            self.stats["objective/scores_margin"] = []
+
+    @property
+    def alpha(self):
+        if isinstance(self._alpha, list):
+            epoch = self.state.epoch
+            return self._alpha[epoch] if epoch < len(self._alpha) else self._alpha[-1]
+        return self._alpha
+
+    def _generate_completions(self, prompts, model):
+        with unwrap_model_for_generation(
+            model, self.accelerator
+        ) as unwrapped_policy_model_for_gen:
+            model_output = unwrapped_policy_model_for_gen.generate(
+                input_ids=prompts["input_ids"],
+                attention_mask=prompts["attention_mask"],
+                generation_config=self.generation_config,
+            )
+
+        actual_model_for_ref_generation: torch.nn.Module
+        if self.ref_model is None:
+            unwrapped_main_model_for_ref_logic = self.accelerator.unwrap_model(model)
+
+            if is_peft_available() and isinstance(
+                unwrapped_main_model_for_ref_logic, PeftModel
+            ):
+                actual_model_for_ref_generation = (
+                    unwrapped_main_model_for_ref_logic.get_base_model()
+                )
+            else:
+                actual_model_for_ref_generation = unwrapped_main_model_for_ref_logic
+        else:
+            actual_model_for_ref_generation = self.accelerator.unwrap_model(
+                self.ref_model
+            )
+
+        with unwrap_model_for_generation(
+            actual_model_for_ref_generation, self.accelerator
+        ) as final_ref_model_for_gen:
+            ref_output = final_ref_model_for_gen.generate(
+                input_ids=prompts["input_ids"],
+                attention_mask=prompts["attention_mask"],
+                generation_config=self.generation_config,
+            )
+
+        return model_output, ref_output
+
+    def _process_completions(self, model_output, ref_output, prompts):
+        context_length = prompts["input_ids"].shape[1]
+
+        # Process model completions
+        model_completion_ids = model_output[:, context_length:]
+        model_completion_ids, model_completion_mask = truncate_right(
+            model_completion_ids,
+            self.processing_class.eos_token_id,
+            self.processing_class.pad_token_id,
+        )
+        model_data = {
+            "input_ids": torch.cat((prompts["input_ids"], model_completion_ids), dim=1),
+            "attention_mask": torch.cat(
+                (prompts["attention_mask"], model_completion_mask), dim=1
+            ),
+            "raw": prompts["raw"],
+        }
+
+        # Process reference model completions
+        ref_completion_ids = ref_output[:, context_length:]
+        ref_completion_ids, ref_completion_mask = truncate_right(
+            ref_completion_ids,
+            self.processing_class.eos_token_id,
+            self.processing_class.pad_token_id,
+        )
+        ref_data = {
+            "input_ids": torch.cat((prompts["input_ids"], ref_completion_ids), dim=1),
+            "attention_mask": torch.cat(
+                (prompts["attention_mask"], ref_completion_mask), dim=1
+            ),
+            "raw": prompts["raw"],
+        }
+
+        return model_data, ref_data
+
+    def _compute_rewards(self, model_data, ref_data, context_length):
+        with torch.no_grad():
+            _, model_scores, _ = get_reward(
+                self.reward_funcs,
+                model_data["input_ids"],
+                self.processing_class.pad_token_id,
+                context_length,
+            )
+            _, ref_scores, _ = get_reward(
+                self.reward_funcs,
+                ref_data["input_ids"],
+                self.processing_class.pad_token_id,
+                context_length,
+            )
+
+        # Apply EOS penalty if needed
+        if self.args.missing_eos_penalty is not None:
+            model_contain_eos = torch.any(
+                model_data["input_ids"] == self.processing_class.eos_token_id, dim=-1
+            )
+            ref_contain_eos = torch.any(
+                ref_data["input_ids"] == self.processing_class.eos_token_id, dim=-1
+            )
+            model_scores[~model_contain_eos] -= self.args.missing_eos_penalty
+            ref_scores[~ref_contain_eos] -= self.args.missing_eos_penalty
+
+        return model_scores, ref_scores
+
+    def _compute_judge(self, model_data, ref_data, context_length):
+        prompts = model_data["raw"]
+        model_data_completions = self.processing_class.batch_decode(
+            model_data["input_ids"][:, context_length:], skip_special_tokens=True
+        )
+        model_data_completions = [
+            completion.strip() for completion in model_data_completions
+        ]
+
+        ref_data_completions = self.processing_class.batch_decode(
+            ref_data["input_ids"][:, context_length:], skip_special_tokens=True
+        )
+        ref_data_completions = [
+            completion.strip() for completion in ref_data_completions
+        ]
+
+        if is_conversational({"prompt": prompts[0]}):
+            model_data_completions = [
+                [{"role": "assistant", "content": completion}]
+                for completion in model_data_completions
+            ]
+            environment = jinja2.Environment()
+            template = environment.from_string(SIMPLE_CHAT_TEMPLATE)
+            prompts = [template.render(messages=message) for message in prompts]
+            model_data_completions = [
+                template.render(messages=completion)
+                for completion in model_data_completions
+            ]
+
+            ref_data_completions = [
+                [{"role": "assistant", "content": completion}]
+                for completion in ref_data_completions
+            ]
+            ref_data_completions = [
+                template.render(messages=completion)
+                for completion in ref_data_completions
+            ]
+
+        ranks_of_first_completion = self.judge.judge(
+            prompts,
+            list(zip(model_data_completions, ref_data_completions, strict=True)),
+        )
+        # convert ranks to a True/False mask:
+        # when rank == 0, it means the first completion is the best
+        # when rank == 1, it means the second completion is the best
+        return torch.tensor(
+            [rank == 0 for rank in ranks_of_first_completion],
+            device=model_data["input_ids"].device,
+        )
+
+    def _compute_logprobs(self, model, model_data, ref_data, context_length):
+        def compute_logprobs_for_data(m, data):
+            output = m(data["input_ids"], attention_mask=data["attention_mask"])
+            logits = output.logits[:, context_length - 1 : -1]
+            token_logprobs = selective_log_softmax(
+                logits, data["input_ids"][:, context_length:]
+            )
+            return token_logprobs
+
+        # Compute logprobs for model completions
+        model_logprobs_model_data = compute_logprobs_for_data(model, model_data)
+        # Compute logprobs for model on reference completions (for XPO loss)
+        model_logprobs_ref_data = compute_logprobs_for_data(model, ref_data)
+
+        # Compute logprobs for reference model completions
+        with torch.no_grad():
+            if self.ref_model is None:
+                with model.disable_adapter():
+                    ref_logprobs_model_data = compute_logprobs_for_data(
+                        model, model_data
+                    )
+                    ref_logprobs_ref_data = compute_logprobs_for_data(model, ref_data)
+            else:
+                ref_logprobs_model_data = compute_logprobs_for_data(
+                    self.ref_model, model_data
+                )
+                ref_logprobs_ref_data = compute_logprobs_for_data(
+                    self.ref_model, ref_data
+                )
+
+        # Mask padding tokens
+        model_padding_mask = model_data["attention_mask"][:, context_length:] == 0
+        ref_padding_mask = ref_data["attention_mask"][:, context_length:] == 0
+        model_logprobs_model_data = model_logprobs_model_data.masked_fill(
+            model_padding_mask, 0.0
+        )
+        model_logprobs_ref_data = model_logprobs_ref_data.masked_fill(
+            ref_padding_mask, 0.0
+        )
+        ref_logprobs_ref_data = ref_logprobs_ref_data.masked_fill(ref_padding_mask, 0.0)
+        ref_logprobs_model_data = ref_logprobs_model_data.masked_fill(
+            model_padding_mask, 0.0
+        )
+
+        return (
+            model_logprobs_model_data,
+            model_logprobs_ref_data,
+            ref_logprobs_ref_data,
+            ref_logprobs_model_data,
+        )
+
+    def _compute_losses(
+        self,
+        model_logprobs_model_data,
+        model_logprobs_ref_data,
+        ref_logprobs_ref_data,
+        ref_logprobs_model_data,
+        chosen_mask,
+    ):
+        # Compute log probs
+        model_logprobs_model_data_sum = model_logprobs_model_data.sum(1)
+        model_logprobs_ref_data_sum = model_logprobs_ref_data.sum(1)
+        ref_logprobs_ref_data_sum = ref_logprobs_ref_data.sum(1)
+        ref_logprobs_model_data_sum = ref_logprobs_model_data.sum(1)
+
+        chosen_model_logprobs = torch.where(
+            chosen_mask, model_logprobs_model_data_sum, model_logprobs_ref_data_sum
+        )
+        chosen_ref_logprobs = torch.where(
+            chosen_mask, ref_logprobs_model_data_sum, ref_logprobs_ref_data_sum
+        )
+        chosen_log_ratios = chosen_model_logprobs - chosen_ref_logprobs
+
+        rejected_model_logprobs = torch.where(
+            ~chosen_mask, model_logprobs_model_data_sum, model_logprobs_ref_data_sum
+        )
+        rejected_ref_logprobs = torch.where(
+            ~chosen_mask, ref_logprobs_model_data_sum, ref_logprobs_ref_data_sum
+        )
+        rejected_log_ratios = rejected_model_logprobs - rejected_ref_logprobs
+
+        # Compute logits as the difference between chosen and rejected log ratios
+        logits = chosen_log_ratios - rejected_log_ratios
+
+        if self.args.loss_type == "sigmoid":
+            dpo_losses = -F.logsigmoid(self.beta * logits)
+        elif self.args.loss_type == "ipo":
+            dpo_losses = (logits - 1 / (2 * self.beta)) ** 2
+        else:
+            raise NotImplementedError(f"invalid loss type {self.args.loss_type}")
+
+        # Compute XPO specific loss
+        xpo_losses = self.alpha * model_logprobs_ref_data_sum
+
+        # Total loss
+        loss = (dpo_losses + xpo_losses).mean()
+
+        return loss, dpo_losses, xpo_losses
+
+    def _log_statistics(
+        self,
+        model_data,
+        ref_data,
+        model_logprobs_model_data,
+        model_logprobs_ref_data,
+        ref_logprobs_ref_data,
+        ref_logprobs_model_data,
+        chosen_mask,
+        dpo_losses,
+        xpo_losses,
+        context_length,
+        model_scores=None,
+        ref_scores=None,
+    ):
+        # Helper function to gather and compute mean
+        def gather_mean(tensor):
+            return self.accelerator.gather_for_metrics(tensor).mean().item()
+
+        # Log losses
+        self.stats["loss/dpo"].append(gather_mean(dpo_losses))
+        self.stats["loss/xpo"].append(gather_mean(xpo_losses))
+
+        # Log scores
+        if self.reward_funcs is not None:
+            self.stats["objective/model_scores"].append(gather_mean(model_scores))
+            self.stats["objective/ref_scores"].append(gather_mean(ref_scores))
+            self.stats["objective/scores_margin"].append(
+                gather_mean(model_scores - ref_scores)
+            )
+
+        # Log logprobs
+        model_logprobs_model_data_sum = model_logprobs_model_data.sum(1)
+        model_logprobs_ref_data_sum = model_logprobs_ref_data.sum(1)
+        ref_logprobs_ref_data_sum = ref_logprobs_ref_data.sum(1)
+        ref_logprobs_model_data_sum = ref_logprobs_model_data.sum(1)
+
+        chosen_model_logprobs = torch.where(
+            chosen_mask, model_logprobs_model_data_sum, model_logprobs_ref_data_sum
+        )
+        chosen_ref_logprobs = torch.where(
+            chosen_mask, ref_logprobs_model_data_sum, ref_logprobs_ref_data_sum
+        )
+        chosen_log_ratios = chosen_model_logprobs - chosen_ref_logprobs
+
+        rejected_model_logprobs = torch.where(
+            ~chosen_mask, model_logprobs_model_data_sum, model_logprobs_ref_data_sum
+        )
+        rejected_ref_logprobs = torch.where(
+            ~chosen_mask, ref_logprobs_model_data_sum, ref_logprobs_ref_data_sum
+        )
+        rejected_log_ratios = rejected_model_logprobs - rejected_ref_logprobs
+
+        self.stats["logps/chosen"].append(
+            gather_mean(chosen_model_logprobs.mean() + chosen_ref_logprobs.mean())
+        )
+        self.stats["logps/rejected"].append(
+            gather_mean(rejected_model_logprobs.mean() + rejected_ref_logprobs.mean())
+        )
+
+        # Log rewards
+        # Compute various statistics
+        chosen_rewards = chosen_log_ratios * self.beta
+        rejected_rewards = rejected_log_ratios * self.beta
+        self.stats["rewards/chosen"].append(gather_mean(chosen_rewards.mean()))
+        self.stats["rewards/rejected"].append(gather_mean(rejected_rewards.mean()))
+
+        # Calculate KL divergence for model and ref data
+        kl_model_data = model_logprobs_model_data - ref_logprobs_model_data
+        kl_ref_data = model_logprobs_ref_data - ref_logprobs_ref_data
+        mean_kl = (kl_model_data.sum(1) + kl_ref_data.sum(1)).mean() / 2
+        self.stats["objective/kl"].append(gather_mean(mean_kl))
+
+        # Calculate entropy for model and ref data
+        entropy_model_data = -model_logprobs_model_data.sum(1)
+        entropy_ref_data = -model_logprobs_ref_data.sum(1)
+        mean_entropy = (entropy_model_data.mean() + entropy_ref_data.mean()) / 2
+        self.stats["objective/entropy"].append(gather_mean(mean_entropy))
+
+        # Calculate margins
+        margin = chosen_rewards - rejected_rewards
+        self.stats["rewards/margins"].append(gather_mean(margin.mean()))
+
+        # Calculate accuracy
+        accuracy = (margin > 0).float()
+        self.stats["rewards/accuracies"].append(gather_mean(accuracy.mean()))
+
+        # Log EOS token statistics
+        model_eos = (
+            model_data["input_ids"][:, context_length:]
+            == self.processing_class.eos_token_id
+        ).any(dim=1)
+        ref_eos = (
+            ref_data["input_ids"][:, context_length:]
+            == self.processing_class.eos_token_id
+        ).any(dim=1)
+        self.stats["val/model_contain_eos_token"].append(gather_mean(model_eos.float()))
+        self.stats["val/ref_contain_eos_token"].append(gather_mean(ref_eos.float()))
+
+        # Log alpha and beta
+        self.stats["alpha"].append(self.alpha)
+        self.stats["beta"].append(self.beta)
+
+    def training_step(
+        self,
+        model: nn.Module,
+        inputs: dict[str, torch.Tensor | Any],
+        num_items_in_batch: int | None = None,
+    ) -> torch.Tensor:
+        model.train()
+
+        # Apply chat template and tokenize the input
+        batch_size = len(next(iter(inputs.values())))
+        prompts = inputs["prompt"]
+        inputs = [{k: v[i] for k, v in inputs.items()} for i in range(batch_size)]
+        inputs = [maybe_apply_chat_template(x, self.processing_class) for x in inputs]
+        inputs = [
+            self.tokenize_row(
+                x, self.model.config.is_encoder_decoder, self.processing_class
+            )
+            for x in inputs
+        ]
+        inputs = self.data_collator(inputs)
+
+        # need the prompt_ only
+        inputs = self._prepare_inputs(inputs)
+        context_length = inputs["prompt_input_ids"].shape[1]
+        prompts = {
+            "input_ids": inputs["prompt_input_ids"],
+            "attention_mask": inputs["prompt_attention_mask"],
+            "raw": prompts,
+        }
+        del inputs
+
+        # Sample completions from both the model and the reference model
+        model_output, ref_output = self._generate_completions(prompts, model)
+
+        # Process model completions
+        model_data, ref_data = self._process_completions(
+            model_output, ref_output, prompts
+        )
+
+        # Compute rewards
+        if self.reward_funcs is not None:
+            model_scores, ref_scores = self._compute_rewards(
+                model_data, ref_data, context_length
+            )
+            chosen_mask = model_scores >= ref_scores
+        else:
+            model_scores, ref_scores = None, None
+            chosen_mask = self._compute_judge(model_data, ref_data, context_length)
+
+        # Compute logprobs
+        (
+            model_logprobs_model_data,
+            model_logprobs_ref_data,
+            ref_logprobs_ref_data,
+            ref_logprobs_model_data,
+        ) = self._compute_logprobs(model, model_data, ref_data, context_length)
+
+        # Compute loss
+        loss, dpo_losses, xpo_losses = self._compute_losses(
+            model_logprobs_model_data,
+            model_logprobs_ref_data,
+            ref_logprobs_ref_data,
+            ref_logprobs_model_data,
+            chosen_mask,
+        )
+
+        # Log everything
+        self._log_statistics(
+            model_data,
+            ref_data,
+            model_logprobs_model_data.detach(),
+            model_logprobs_ref_data.detach(),
+            ref_logprobs_ref_data,
+            ref_logprobs_model_data,
+            chosen_mask,
+            dpo_losses.detach(),
+            xpo_losses.detach(),
+            context_length,
+            model_scores,
+            ref_scores,
+        )
+
+        if (
+            self.args.torch_empty_cache_steps is not None
+            and self.state.global_step % self.args.torch_empty_cache_steps == 0
+        ):
+            empty_cache()
+
+        kwargs = {}
+        # For LOMO optimizers you need to explicitly use the learning rate
+        if self.args.optim in [OptimizerNames.LOMO, OptimizerNames.ADALOMO]:
+            kwargs["learning_rate"] = self._get_learning_rate()
+
+        if self.args.n_gpu > 1:
+            loss = loss.mean()  # mean() to average on multi-gpu parallel training
+
+        self.accelerator.backward(loss, **kwargs)
+
+        return loss.detach() / self.args.gradient_accumulation_steps
diff --git a/src/aixpert/training/training/trl/extras/__init__.py b/src/aixpert/training/training/trl/extras/__init__.py
new file mode 100644
index 0000000..a317018
--- /dev/null
+++ b/src/aixpert/training/training/trl/extras/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/src/aixpert/training/training/trl/extras/dataset_formatting.py b/src/aixpert/training/training/trl/extras/dataset_formatting.py
new file mode 100644
index 0000000..4853624
--- /dev/null
+++ b/src/aixpert/training/training/trl/extras/dataset_formatting.py
@@ -0,0 +1,183 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import warnings
+from collections.abc import Callable
+from typing import Literal
+
+import datasets
+from datasets import Dataset, Value
+from packaging import version
+from transformers import AutoTokenizer
+
+
+if version.parse(datasets.__version__) >= version.parse("4.0.0"):
+    from datasets import List
+
+    FORMAT_MAPPING = {
+        "chatml": List(
+            {
+                "content": Value(dtype="string", id=None),
+                "role": Value(dtype="string", id=None),
+            }
+        ),
+        "instruction": {
+            "completion": Value(dtype="string", id=None),
+            "prompt": Value(dtype="string", id=None),
+        },
+    }
+else:
+    FORMAT_MAPPING = {
+        "chatml": [
+            {
+                "content": Value(dtype="string", id=None),
+                "role": Value(dtype="string", id=None),
+            }
+        ],
+        "instruction": {
+            "completion": Value(dtype="string", id=None),
+            "prompt": Value(dtype="string", id=None),
+        },
+    }
+
+
+def conversations_formatting_function(
+    tokenizer: AutoTokenizer,
+    messages_field: Literal["messages", "conversations"],
+    tools: list | None = None,
+):
+    r"""
+    Return a callable function that takes in a "messages" dataset and returns a formatted dataset, based on the
+    tokenizer apply chat template to the dataset along with the schema of the list of functions in the tools list.
+
+    <Deprecated version="0.24.0">
+
+    `conversations_formatting_function` is deprecated and will be removed in version 0.27. Please use
+    `tokenizer.apply_chat_template()` directly instead.
+
+    </Deprecated>
+    """
+    warnings.warn(
+        "`conversations_formatting_function` is deprecated and will be removed in TRL 0.27. "
+        "Please use `tokenizer.apply_chat_template()` directly instead.",
+        FutureWarning,
+        stacklevel=2,
+    )
+
+    def format_dataset(examples):
+        if isinstance(examples[messages_field][0], list):
+            output_texts = []
+            for i in range(len(examples[messages_field])):
+                output_texts.append(
+                    tokenizer.apply_chat_template(
+                        examples[messages_field][i], tokenize=False, tools=tools
+                    )
+                )
+            return output_texts
+        return tokenizer.apply_chat_template(
+            examples[messages_field], tokenize=False, tools=tools
+        )
+
+    return format_dataset
+
+
+def instructions_formatting_function(tokenizer: AutoTokenizer):
+    r"""
+    Return a callable function that takes in an "instructions" dataset and returns a formatted dataset, based on the
+    tokenizer apply chat template to the dataset
+
+    <Deprecated version="0.24.0">
+
+    `instructions_formatting_function` is deprecated and will be removed in version 0.27. Please use
+    `tokenizer.apply_chat_template()` directly instead.
+
+    </Deprecated>
+    """
+    warnings.warn(
+        "`instructions_formatting_function` is deprecated and will be removed in TRL 0.27. "
+        "Please use `tokenizer.apply_chat_template()` directly instead.",
+        FutureWarning,
+        stacklevel=2,
+    )
+
+    def format_dataset(examples):
+        if isinstance(examples["prompt"], list):
+            output_texts = []
+            for i in range(len(examples["prompt"])):
+                converted_sample = [
+                    {"role": "user", "content": examples["prompt"][i]},
+                    {"role": "assistant", "content": examples["completion"][i]},
+                ]
+                output_texts.append(
+                    tokenizer.apply_chat_template(converted_sample, tokenize=False)
+                )
+            return output_texts
+        converted_sample = [
+            {"role": "user", "content": examples["prompt"]},
+            {"role": "assistant", "content": examples["completion"]},
+        ]
+        return tokenizer.apply_chat_template(converted_sample, tokenize=False)
+
+    return format_dataset
+
+
+def get_formatting_func_from_dataset(
+    dataset: Dataset, tokenizer: AutoTokenizer, tools: list | None = None
+) -> Callable | None:
+    r"""
+    Finds the correct formatting function based on the dataset structure. Currently supported datasets are:
+    - `ChatML` with [{"role": str, "content": str}]
+    - `instruction` with [{"prompt": str, "completion": str}]
+
+    Args:
+        dataset (Dataset): User dataset
+        tokenizer (AutoTokenizer): Tokenizer used for formatting
+        tools (list, *optional*): List of tools (callable functions) that will be accessible to the model.
+            If the template does not support function calling, this argument will have no effect.
+
+    Returns
+    -------
+        Callable: Formatting function if the dataset format is supported else None
+
+    <Deprecated version="0.24.0">
+
+    `get_formatting_func_from_dataset` is deprecated and will be removed in version 0.27. Please use
+    `tokenizer.apply_chat_template()` directly instead.
+
+    </Deprecated>
+    """
+    warnings.warn(
+        "`get_formatting_func_from_dataset` is deprecated and will be removed in TRL 0.27. "
+        "Please use `tokenizer.apply_chat_template()` directly instead.",
+        FutureWarning,
+        stacklevel=2,
+    )
+
+    if isinstance(dataset, Dataset):
+        if "messages" in dataset.features:
+            if dataset.features["messages"] == FORMAT_MAPPING["chatml"]:
+                logging.info("Formatting dataset with chatml format")
+                return conversations_formatting_function(tokenizer, "messages", tools)
+        if "conversations" in dataset.features:
+            if dataset.features["conversations"] == FORMAT_MAPPING["chatml"]:
+                logging.info("Formatting dataset with chatml format")
+                return conversations_formatting_function(
+                    tokenizer, "conversations", tools
+                )
+        elif dataset.features == FORMAT_MAPPING["instruction"]:
+            logging.info("Formatting dataset with instruction format")
+            return instructions_formatting_function(tokenizer)
+
+    return None
diff --git a/src/aixpert/training/training/trl/extras/profiling.py b/src/aixpert/training/training/trl/extras/profiling.py
new file mode 100644
index 0000000..928b330
--- /dev/null
+++ b/src/aixpert/training/training/trl/extras/profiling.py
@@ -0,0 +1,110 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import contextlib
+import functools
+import time
+from collections.abc import Callable, Generator
+
+from transformers import Trainer
+from transformers.integrations import is_mlflow_available, is_wandb_available
+
+
+if is_wandb_available():
+    import wandb
+
+if is_mlflow_available():
+    import mlflow
+
+
+@contextlib.contextmanager
+def profiling_context(trainer: Trainer, name: str) -> Generator[None, None, None]:
+    """
+    A context manager function for profiling a block of code. Results are logged to Weights & Biases or MLflow
+    depending on the trainer's configuration.
+
+    Args:
+        trainer (`~transformers.Trainer`):
+            Trainer object.
+        name (`str`):
+            Name of the block to be profiled. Used as a key in the logged dictionary.
+
+    Example:
+    ```python
+    from transformers import Trainer
+    from trl.extras.profiling import profiling_context
+
+
+    class MyTrainer(Trainer):
+        def some_method(self):
+            A = np.random.rand(1000, 1000)
+            B = np.random.rand(1000, 1000)
+            with profiling_context(self, "matrix_multiplication"):
+                # Code to profile: simulate a computationally expensive operation
+                result = A @ B  # Matrix multiplication
+    ```
+    """
+    start_time = time.perf_counter()
+    yield
+    end_time = time.perf_counter()
+    duration = end_time - start_time
+
+    profiling_metrics = {
+        f"profiling/Time taken: {trainer.__class__.__name__}.{name}": duration
+    }
+    if (
+        "wandb" in trainer.args.report_to
+        and wandb.run is not None
+        and trainer.accelerator.is_main_process
+    ):
+        wandb.log(profiling_metrics)
+
+    if (
+        "mlflow" in trainer.args.report_to
+        and mlflow.run is not None
+        and trainer.accelerator.is_main_process
+    ):
+        mlflow.log_metrics(profiling_metrics, step=trainer.state.global_step)
+
+
+def profiling_decorator(func: Callable) -> Callable:
+    """
+    Decorator to profile a function and log execution time using [`extras.profiling.profiling_context`].
+
+    Args:
+        func (`Callable`):
+            Function to be profiled.
+
+    Example:
+    ```python
+    from transformers import Trainer
+    from trl.extras.profiling import profiling_decorator
+
+
+    class MyTrainer(Trainer):
+        @profiling_decorator
+        def some_method(self):
+            A = np.random.rand(1000, 1000)
+            B = np.random.rand(1000, 1000)
+            # Code to profile: simulate a computationally expensive operation
+            result = A @ B
+    ```
+    """
+
+    @functools.wraps(func)
+    def wrapper(self, *args, **kwargs):
+        with profiling_context(self, func.__name__):
+            return func(self, *args, **kwargs)
+
+    return wrapper
diff --git a/src/aixpert/training/training/trl/extras/vllm_client.py b/src/aixpert/training/training/trl/extras/vllm_client.py
new file mode 100644
index 0000000..2b42f6f
--- /dev/null
+++ b/src/aixpert/training/training/trl/extras/vllm_client.py
@@ -0,0 +1,543 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import atexit
+import base64
+import copy
+import logging
+import socket
+import time
+from io import BytesIO
+from urllib.parse import urlparse
+
+import torch
+import torch.distributed.distributed_c10d as c10d
+from torch import nn
+from transformers import is_torch_xpu_available
+
+from ..import_utils import (
+    is_requests_available,
+    is_vllm_ascend_available,
+    is_vllm_available,
+)
+
+
+if is_requests_available():
+    import requests
+    from requests import ConnectionError
+
+
+if is_vllm_available():
+    from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
+    from vllm.distributed.utils import StatelessProcessGroup
+
+    if is_vllm_ascend_available():
+        from vllm_ascend.distributed.device_communicators.pyhccl import (
+            PyHcclCommunicator as PyNcclCommunicator,
+        )
+
+
+logger = logging.getLogger(__name__)
+
+
+def pil_to_base64(image):
+    buffer = BytesIO()
+    image.save(buffer, format="PNG")
+    img_bytes = buffer.getvalue()
+    return base64.b64encode(img_bytes).decode("utf-8")
+
+
+class VLLMClient:
+    """
+    A client class to interact with a vLLM server.
+
+    This class provides methods to generate completions, initialize and manage weight update groups, and update model
+    weights in a distributed setting. Before using it, start the vLLM server with `trl vllm-serve`.
+
+    Args:
+        base_url (`str`, *optional*):
+            Base URL for the vLLM server (e.g., `"http://localhost:8000"`). If provided, `host` and `server_port` are
+            ignored.
+        host (`str`, *optional*, defaults to `"0.0.0.0"`):
+            IP address of the vLLM server. Ignored if `base_url` is provided.
+        server_port (`int`, *optional*, defaults to `8000`):
+            Port number of the vLLM server. Ignored if `base_url` is provided.
+        group_port (`int`, *optional*, defaults to `51216`):
+            Port number for the weight update group.
+        connection_timeout (`float`, *optional*, defaults to `0.0`):
+            Total timeout duration in seconds to wait for the server to be up. If the server is not up after the
+            timeout, a `ConnectionError` is raised.
+
+    Examples
+    --------
+        Run the vLLM server with the model `Qwen/Qwen2.5-7B`:
+
+        ```
+        $ trl vllm-serve --model Qwen/Qwen2.5-7B
+        ...
+        INFO:     Application startup complete.
+        INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
+        ```
+
+        Use the client to generate completions and update model weights:
+
+        ```python
+        >>> from trl.extras.vllm_client import VLLMClient
+
+        >>> client = VLLMClient()
+        >>> client.generate(["Hello, AI!", "Tell me a joke"])
+        {'prompt_ids': [[9707, 11, 15235, 0],
+                        [40451, 752, 264, 21646]],
+         'completion_ids': [[11479, 752, 5046, 279, 1465, 304, 419, 23670, 2038, 358, 2776, 4378, 369, 847, 15549, 6733],
+                            [911, 19654, 382, 3838, 1558, 279, 16158, 1977, 979, 498, 2299, 4460, 311, 10542, 432, 518]],
+         'logprobs': [[-5.193126201629639, -0.05592319369316101, -4.861808776855469, -1.673396110534668, -2.6316866874694824, -0.2861405313014984, -0.35006725788116455, -5.23351526260376, -0.1447441577911377, -5.21489953994751, -1.6022650003433228, -1.9649192094802856, -2.1338791847229004, -1.2775304317474365, -10.004860877990723, -4.171003818511963],
+                      [-0.012896230444312096, -5.747106552124023, -1.5248860120773315, -1.9286258220672607, -2.8512537479400635, -2.8055880069732666, -3.019822835922241, -0.37132859230041504, -0.6311739087104797, -2.562908411026001, -3.1664533615112305, -2.685293436050415, -0.007259538397192955, -7.339841842651367, -1.188662052154541, -3.54781436920166]]}
+
+        >>> from transformers import AutoModelForCausalLM
+
+        >>> model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-7B", device_map="cuda")
+        >>> client.init_communicator(device="cuda")
+        >>> client.update_model_params(model)
+        ```
+
+        There are several ways to initialize the client:
+
+        ```python
+        VLLMClient(base_url="http://localhost:8000")
+        VLLMClient(base_url="http://192.168.1.100:8000")
+        VLLMClient(host="localhost", server_port=8000)
+        VLLMClient(host="192.168.1.100", server_port=8000)
+        ```
+    """
+
+    def __init__(
+        self,
+        base_url: str | None = None,
+        host: str = "0.0.0.0",
+        server_port: int = 8000,
+        group_port: int = 51216,
+        connection_timeout: float = 0.0,
+    ):
+        if not is_requests_available():
+            raise ImportError(
+                "requests is not installed. Please install it with `pip install requests`."
+            )
+        if not is_vllm_available():
+            raise ImportError(
+                "vLLM is not installed. Please install it with `pip install trl[vllm]`."
+            )
+
+        self.session = requests.Session()
+
+        if base_url is not None:
+            # Parse the base_url to extract host and port
+            parsed_url = urlparse(base_url)
+            self.host = socket.gethostbyname(parsed_url.hostname)
+            scheme = parsed_url.scheme or "http"
+            self.base_url = f"{scheme}://{parsed_url.netloc}{parsed_url.path}"
+        else:
+            self.host = host
+            self.server_port = server_port
+            self.base_url = f"http://{self.host}:{self.server_port}"
+        self.group_port = group_port
+        self.check_server(connection_timeout)  # check server and fail after timeout
+
+    def check_server(self, total_timeout: float = 0.0, retry_interval: float = 2.0):
+        """
+        Check server availability with retries on failure, within a total timeout duration. If the server is not up
+        after the total timeout duration, raise a `ConnectionError`.
+
+        Args:
+            retry_interval (`float`, *optional*, defaults to `2.0`):
+                Interval in seconds between retries.
+            total_timeout (`float`, *optional*, defaults to `0.0`):
+                Total timeout duration in seconds.
+        """
+        url = f"{self.base_url}/health/"
+        start_time = time.time()  # Record the start time
+
+        while True:
+            try:
+                response = requests.get(url)
+            except requests.exceptions.RequestException as exc:
+                # Check if the total timeout duration has passed
+                elapsed_time = time.time() - start_time
+                if elapsed_time >= total_timeout:
+                    raise ConnectionError(
+                        f"The vLLM server can't be reached at {self.base_url} after {total_timeout} seconds. Make "
+                        "sure the server is running by running `trl vllm-serve`."
+                    ) from exc
+            else:
+                if response.status_code == 200:
+                    if "X-Forwarded-For" in response.headers:
+                        self.host = response.headers["X-Forwarded-For"]
+                    logger.info("Server is up!")
+                    return
+
+            # Retry logic: wait before trying again
+            logger.info(
+                f"Server is not up yet. Retrying in {retry_interval} seconds..."
+            )
+            time.sleep(retry_interval)
+
+    def generate(
+        self,
+        prompts: list[str],
+        images: list | None = None,
+        n: int = 1,
+        repetition_penalty: float = 1.0,
+        temperature: float = 1.0,
+        top_p: float = 1.0,
+        top_k: int = -1,
+        min_p: float = 0.0,
+        max_tokens: int = 16,
+        truncate_prompt_tokens: int | None = None,
+        guided_decoding_regex: str | None = None,
+        generation_kwargs: dict | None = None,
+    ) -> dict[str, list[list[int]]]:
+        """
+        Generates model completions for the provided prompts.
+
+        Args:
+            prompts (`list[str]`):
+                List of text prompts for which the model will generate completions.
+            images (`list[PIL.Image]`, *optional*):
+                List of PIL Images to send along with the prompts.
+            n (`int`, *optional*, defaults to `1`):
+                Number of completions to generate for each prompt.
+            repetition_penalty (`float`, *optional*, defaults to `1.0`):
+                Parameter for repetition penalty. 1.0 means no penalty.
+            temperature (`float`, *optional*, defaults to `1.0`):
+                Temperature parameter for sampling. Higher values increase diversity.
+            top_p (`float`, *optional*, defaults to `1.0`):
+                Top-p sampling parameter.`1.0` means no truncation.
+            top_k (`int`, *optional*, defaults to `-1`):
+                Top-k sampling parameter. `-1` means no truncation.
+            min_p (`float`, *optional*, defaults to `0.0`):
+                Minimum probability for sampling.
+            max_tokens (`int`, *optional*, defaults to `16`):
+                Maximum number of tokens to generate for each prompt.
+            truncate_prompt_tokens (`int`, *optional*):
+                If set to `-1`, will use the truncation size supported by the model. If set to an integer k, will use
+                only the last k tokens from the prompt (i.e., left truncation). If set to `None`, truncation is
+                disabled.
+            guided_decoding_regex (`str`, *optional*):
+                Regular expression to guide the decoding process.
+            generation_kwargs (`dict`, *optional*):
+                Additional generation parameters to pass to the vLLM `SamplingParams`. This can include parameters like
+                `seed`, `frequency_penalty`, etc. If it contains keys that conflict with the other parameters, they
+                will override them.
+
+        Returns
+        -------
+            `dict` with keys:
+                - `prompt_ids` (`list[list[int]]`):
+                    List of lists of token IDs representing the tokenized input prompts.
+                - `completion_ids` (`list[list[int]]`):
+                    List of lists of token IDs representing the model-generated completions for each prompt.
+                - `logprobs` (`list[list[float]]`):
+                    List of lists of log probabilities for each generated token.
+        """
+        url = f"{self.base_url}/generate/"
+
+        # Convert PIL images to base64 strings
+        images = [pil_to_base64(img) for img in images] if images else None
+
+        response = self.session.post(
+            url,
+            json={
+                "prompts": prompts,
+                "images": images,
+                "n": n,
+                "repetition_penalty": repetition_penalty,
+                "temperature": temperature,
+                "top_p": top_p,
+                "top_k": top_k,
+                "min_p": min_p,
+                "max_tokens": max_tokens,
+                "truncate_prompt_tokens": truncate_prompt_tokens,
+                "guided_decoding_regex": guided_decoding_regex,
+                "generation_kwargs": generation_kwargs or {},
+            },
+        )
+        if response.status_code == 200:
+            json_response = response.json()
+            return {
+                "prompt_ids": json_response["prompt_ids"],
+                "completion_ids": json_response["completion_ids"],
+                "logprobs": json_response["logprobs"],
+            }
+        raise Exception(f"Request failed: {response.status_code}, {response.text}")
+
+    def chat(
+        self,
+        messages: list[list[dict]],
+        n: int = 1,
+        repetition_penalty: float = 1.0,
+        temperature: float = 1.0,
+        top_p: float = 1.0,
+        top_k: int = -1,
+        min_p: float = 0.0,
+        max_tokens: int = 16,
+        truncate_prompt_tokens: int | None = None,
+        guided_decoding_regex: str | None = None,
+        generation_kwargs: dict | None = None,
+        chat_template_kwargs: dict | None = None,
+    ) -> dict[str, list[list[int]]]:
+        """
+        Generates model completions for the provided chat messages.
+
+        Args:
+            messages (`list[list[dict]]`):
+                List of message lists for which the model will generate completions. Each message is a dictionary with
+                keys like "role" and "content".
+            n (`int`, *optional*, defaults to `1`):
+                Number of completions to generate for each message list.
+            repetition_penalty (`float`, *optional*, defaults to `1.0`):
+                Parameter for repetition penalty. 1.0 means no penalty.
+            temperature (`float`, *optional*, defaults to `1.0`):
+                Temperature parameter for sampling. Higher values increase diversity.
+            top_p (`float`, *optional*, defaults to `1.0`):
+                Top-p sampling parameter.`1.0` means no truncation.
+            top_k (`int`, *optional*, defaults to `-1`):
+                Top-k sampling parameter. `-1` means no truncation.
+            min_p (`float`, *optional*, defaults to `0.0`):
+                Minimum probability for sampling.
+            max_tokens (`int`, *optional*, defaults to `16`):
+                Maximum number of tokens to generate for each message list.
+            truncate_prompt_tokens (`int`, *optional*):
+                If set to `-1`, will use the truncation size supported by the model. If set to an integer k, will use
+                only the last k tokens from the prompt (i.e., left truncation). If set to `None`, truncation is
+                disabled.
+            guided_decoding_regex (`str`, *optional*):
+                Regular expression to guide the decoding process.
+            generation_kwargs (`dict`, *optional*):
+                Additional generation parameters to pass to the vLLM `SamplingParams`. This can include parameters like
+                `seed`, `frequency_penalty`, etc. If it contains keys that conflict with the other parameters, they
+                will override them.
+            chat_template_kwargs (`dict`, *optional*):
+                Additional keyword arguments to customize the chat template used by the model.
+
+        Returns
+        -------
+            `dict` with keys:
+                - `prompt_ids` (`list[list[int]]`):
+                    List of lists of token IDs representing the tokenized input messages.
+                - `completion_ids` (`list[list[int]]`):
+                    List of lists of token IDs representing the model-generated completions for each message list.
+                - `logprobs` (`list[list[float]]`):
+                    List of lists of log probabilities for each generated token.
+        """
+        url = f"{self.base_url}/chat/"
+
+        # Convert PIL images to base64 strings
+        messages = copy.deepcopy(messages)  # avoid modifying the original messages
+        for message_list in messages:
+            for message in message_list:
+                if isinstance(message["content"], list):
+                    for part in message["content"]:
+                        if part["type"] == "image_pil":
+                            part["image_pil"] = pil_to_base64(part["image_pil"])
+
+        response = self.session.post(
+            url,
+            json={
+                "messages": messages,
+                "n": n,
+                "repetition_penalty": repetition_penalty,
+                "temperature": temperature,
+                "top_p": top_p,
+                "top_k": top_k,
+                "min_p": min_p,
+                "max_tokens": max_tokens,
+                "truncate_prompt_tokens": truncate_prompt_tokens,
+                "guided_decoding_regex": guided_decoding_regex,
+                "generation_kwargs": generation_kwargs or {},
+                "chat_template_kwargs": chat_template_kwargs or {},
+            },
+        )
+        if response.status_code == 200:
+            json_response = response.json()
+            return {
+                "prompt_ids": json_response["prompt_ids"],
+                "completion_ids": json_response["completion_ids"],
+                "logprobs": json_response["logprobs"],
+            }
+        raise Exception(f"Request failed: {response.status_code}, {response.text}")
+
+    def init_communicator(self, device: torch.device | str | int = 0):
+        """
+        Initializes the weight update group in a distributed setup for model synchronization.
+
+        Args:
+            device (`torch.device`, `str`, or `int`, *optional*, defaults to `0`):
+                Device of trainer main process. It's the device that will be used for the weights synchronization. Can
+                be a `torch.device` object, a string like `'cuda:0'`, or an integer device index.
+        """
+        # Get the world size from the server
+        url = f"{self.base_url}/get_world_size/"
+        response = requests.get(url)
+        if response.status_code == 200:
+            vllm_world_size = response.json()["world_size"]
+        else:
+            raise Exception(f"Request failed: {response.status_code}, {response.text}")
+
+        world_size = vllm_world_size + 1  # add the client to the world
+        self.rank = vllm_world_size  # the client's rank is the last process
+
+        # Initialize weight update group
+        url = f"{self.base_url}/init_communicator/"
+        # Will simplify it after torch xpu 2.9 support get uuid.
+        if is_torch_xpu_available():
+            if hasattr(torch.xpu.get_device_properties(device), "uuid"):
+                client_device_uuid = str(torch.xpu.get_device_properties(device).uuid)
+            else:
+                client_device_uuid = "42"
+        else:
+            client_device_uuid = str(torch.cuda.get_device_properties(device).uuid)
+
+        # Set the weight update group's host to "0.0.0.0" so that
+        # clients from different IPs can send updated weights
+        response = self.session.post(
+            url,
+            json={
+                "host": "0.0.0.0",
+                "port": self.group_port,
+                "world_size": world_size,
+                "client_device_uuid": client_device_uuid,
+            },
+        )
+        if response.status_code != 200:
+            raise Exception(f"Request failed: {response.status_code}, {response.text}")
+
+        # Brief delay to allow server initialization. While not strictly required (client socket will retry on
+        # connection failure), this prevents log warnings like:
+        # [W416 23:24:57.460001114 socket.cpp:204] [c10d] The hostname of the client socket cannot be retrieved. err=-3
+        time.sleep(0.1)
+
+        # Set up the communication group for weight broadcasting
+        if is_torch_xpu_available():
+            store = torch.distributed.TCPStore(
+                host_name=self.host,
+                port=self.group_port,
+                world_size=world_size,
+                is_master=(self.rank == 0),
+            )
+            prefixed_store = c10d.PrefixStore("client2server", store)
+            pg = c10d.ProcessGroupXCCL(
+                store=prefixed_store,
+                rank=self.rank,
+                size=world_size,
+            )
+            self.communicator = pg
+        else:
+            pg = StatelessProcessGroup.create(
+                host=self.host,
+                port=self.group_port,
+                rank=self.rank,
+                world_size=world_size,
+            )
+            self.communicator = PyNcclCommunicator(pg, device=device)
+
+        # When the client object is deleted, close the weight update group
+        atexit.register(self.close_communicator)
+
+    def update_named_param(self, name: str, weights: torch.Tensor):
+        """
+        Updates a specific named parameter in the model and broadcasts it to other processes.
+
+        Args:
+            name (`str`):
+                Name of the layer whose weights are being updated.
+            weights (`torch.Tensor`):
+                Tensor containing the updated weights.
+        """
+        dtype, shape = str(weights.dtype), tuple(weights.shape)
+        url = f"{self.base_url}/update_named_param/"
+        response = self.session.post(
+            url, json={"name": name, "dtype": dtype, "shape": shape}
+        )
+        if response.status_code != 200:
+            raise Exception(f"Request failed: {response.status_code}, {response.text}")
+
+        if is_torch_xpu_available():
+            # Use XCCL to broadcast the updated weights from the client (src) to all workers.
+            self.communicator.broadcast(weights, root=self.rank)
+            self.communicator.barrier()
+        else:
+            # Use NCCL to broadcast the updated weights from the client (src) to all workers.
+            self.communicator.broadcast(weights, src=self.rank)
+            self.communicator.group.barrier()
+
+    def update_model_params(self, model: nn.Module):
+        """
+        Updates all parameters of the given model by calling `update_named_param` for each parameter in the model.
+
+        Args:
+            model (`nn.Module`):
+                Model whose parameters (weights/biases) are to be updated.
+        """
+        for name, param in model.named_parameters():
+            # Update each parameter individually
+            self.update_named_param(name, param.data)
+
+    def reset_prefix_cache(self):
+        """
+        Resets the prefix cache for the model.
+        """
+        url = f"{self.base_url}/reset_prefix_cache/"
+        response = self.session.post(url)
+        if response.status_code != 200:
+            raise Exception(f"Request failed: {response.status_code}, {response.text}")
+
+    def close_communicator(self):
+        """
+        Closes the weight update group and cleans up the communication group.
+        """
+        url = f"{self.base_url}/close_communicator/"
+
+        try:
+            response = self.session.post(url)
+        except ConnectionError:
+            # The server might be already down, so we don't need to close the communicator
+            pass
+        else:
+            if response.status_code != 200:
+                raise Exception(
+                    f"Request failed: {response.status_code}, {response.text}"
+                )
+
+
+# Example usage
+if __name__ == "__main__":
+    from vllm import SamplingParams
+
+    device = "xpu" if is_torch_xpu_available() else "cuda"
+    client = VLLMClient()
+    client.init_communicator(device=device)
+
+    # Generate completions
+    responses = client.generate(
+        ["Hello, AI!", "Tell me a joke"],
+        n=4,
+        max_tokens=32,
+        sampling_params=SamplingParams(),
+    )
+    print("Responses:", responses)  # noqa
+
+    # Update model weights
+    from transformers import AutoModelForCausalLM
+
+    model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-7B").to(device)
+    client.update_model_params(model)
diff --git a/src/aixpert/training/training/trl/import_utils.py b/src/aixpert/training/training/trl/import_utils.py
new file mode 100644
index 0000000..edc3d6b
--- /dev/null
+++ b/src/aixpert/training/training/trl/import_utils.py
@@ -0,0 +1,172 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import importlib
+import os
+import warnings
+from itertools import chain
+from types import ModuleType
+from typing import Any
+
+from packaging import version
+from transformers.utils.import_utils import _is_package_available
+
+
+LIGER_KERNEL_MIN_VERSION = "0.5.8"
+
+# Use same as transformers.utils.import_utils
+_deepspeed_available = _is_package_available("deepspeed")
+_fastapi_available = _is_package_available("fastapi")
+_joblib_available = _is_package_available("joblib")
+_liger_kernel_available, _liger_kernel_version = _is_package_available(
+    "liger_kernel", return_version=True
+)
+_llm_blender_available = _is_package_available("llm_blender")
+_math_verify_available = _is_package_available("math_verify")
+_mergekit_available = _is_package_available("mergekit")
+_pydantic_available = _is_package_available("pydantic")
+_requests_available = _is_package_available("requests")
+_unsloth_available = _is_package_available("unsloth")
+_uvicorn_available = _is_package_available("uvicorn")
+_vllm_available, _vllm_version = _is_package_available("vllm", return_version=True)
+_vllm_ascend_available = _is_package_available("vllm_ascend")
+_weave_available = _is_package_available("weave")
+
+
+def is_deepspeed_available() -> bool:
+    return _deepspeed_available
+
+
+def is_fastapi_available() -> bool:
+    return _fastapi_available
+
+
+def is_joblib_available() -> bool:
+    return _joblib_available
+
+
+def is_liger_kernel_available(min_version: str = LIGER_KERNEL_MIN_VERSION) -> bool:
+    return _liger_kernel_available and version.parse(
+        _liger_kernel_version
+    ) >= version.parse(min_version)
+
+
+def is_llm_blender_available() -> bool:
+    return _llm_blender_available
+
+
+def is_math_verify_available() -> bool:
+    return _math_verify_available
+
+
+def is_mergekit_available() -> bool:
+    return _mergekit_available
+
+
+def is_pydantic_available() -> bool:
+    return _pydantic_available
+
+
+def is_requests_available() -> bool:
+    return _requests_available
+
+
+def is_unsloth_available() -> bool:
+    return _unsloth_available
+
+
+def is_uvicorn_available() -> bool:
+    return _uvicorn_available
+
+
+def is_vllm_available() -> bool:
+    if _vllm_available and version.parse(_vllm_version) != version.parse("0.10.2"):
+        warnings.warn(
+            f"TRL currently only supports vLLM version `0.10.2`. You have version {_vllm_version} installed. We "
+            "recommend to install this version to avoid compatibility issues.",
+            UserWarning,
+        )
+    return _vllm_available
+
+
+def is_vllm_ascend_available() -> bool:
+    return _vllm_ascend_available
+
+
+def is_weave_available() -> bool:
+    return _weave_available
+
+
+class _LazyModule(ModuleType):
+    """
+    Module class that surfaces all objects but only performs associated imports when the objects are requested.
+    """
+
+    # Very heavily inspired by optuna.integration._IntegrationModule
+    # https://github.com/optuna/optuna/blob/master/optuna/integration/__init__.py
+    def __init__(
+        self, name, module_file, import_structure, module_spec=None, extra_objects=None
+    ):
+        super().__init__(name)
+        self._modules = set(import_structure.keys())
+        self._class_to_module = {}
+        for key, values in import_structure.items():
+            for value in values:
+                self._class_to_module[value] = key
+        # Needed for autocompletion in an IDE
+        self.__all__ = list(import_structure.keys()) + list(
+            chain(*import_structure.values())
+        )
+        self.__file__ = module_file
+        self.__spec__ = module_spec
+        self.__path__ = [os.path.dirname(module_file)]
+        self._objects = {} if extra_objects is None else extra_objects
+        self._name = name
+        self._import_structure = import_structure
+
+    # Needed for autocompletion in an IDE
+    def __dir__(self):
+        result = super().__dir__()
+        # The elements of self.__all__ that are submodules may or may not be in the dir already, depending on whether
+        # they have been accessed or not. So we only add the elements of self.__all__ that are not already in the dir.
+        for attr in self.__all__:
+            if attr not in result:
+                result.append(attr)
+        return result
+
+    def __getattr__(self, name: str) -> Any:
+        if name in self._objects:
+            return self._objects[name]
+        if name in self._modules:
+            value = self._get_module(name)
+        elif name in self._class_to_module.keys():
+            module = self._get_module(self._class_to_module[name])
+            value = getattr(module, name)
+        else:
+            raise AttributeError(f"module {self.__name__} has no attribute {name}")
+
+        setattr(self, name, value)
+        return value
+
+    def _get_module(self, module_name: str):
+        try:
+            return importlib.import_module("." + module_name, self.__name__)
+        except Exception as e:
+            raise RuntimeError(
+                f"Failed to import {self.__name__}.{module_name} because of the following error (look up to see its"
+                f" traceback):\n{e}"
+            ) from e
+
+    def __reduce__(self):
+        return (self.__class__, (self._name, self.__file__, self._import_structure))
diff --git a/src/aixpert/training/training/trl/mergekit_utils.py b/src/aixpert/training/training/trl/mergekit_utils.py
new file mode 100644
index 0000000..cc1b924
--- /dev/null
+++ b/src/aixpert/training/training/trl/mergekit_utils.py
@@ -0,0 +1,297 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+from huggingface_hub import HfApi
+
+from .import_utils import is_mergekit_available
+
+
+if is_mergekit_available():
+    from mergekit.config import MergeConfiguration
+    from mergekit.merge import MergeOptions, run_merge
+
+
+def upload_model_to_hf(folder_path: str, repo_id: str):
+    api = HfApi()
+    # Create the repository if it doesn't exist
+    repo = api.create_repo(repo_id, repo_type="model")
+
+    # Upload the folder to the specified repository
+    api.upload_folder(
+        folder_path=folder_path,
+        repo_id=repo.repo_id,
+        repo_type=repo.repo_type,
+    )
+
+
+class MergeConfig:
+    r"""
+    Configuration class for merging two models using `mergekit`.
+
+    This class provides a structured way to configure and generate merge configurations for various merge methods, such
+    as `linear`, `ties`, `dare_ties`, and `slerp`.
+
+    Args:
+        method (`str`, *optional*, defaults to `"linear"`):
+            Merge method to use. Supported methods include:
+
+            - `"linear"`: Linearly combines two models with specified weights.
+            - `"ties"`: Combines two models using the TIES method with density parameters.
+            - `"dare_ties"`: A variant of TIES for domain adaptation.
+            - `"slerp"`: Combines models using spherical linear interpolation.
+
+    Note:
+
+        For more details about the merge methods and how they are implemented, see the [MergeKit GitHub
+        repository](https://github.com/arcee-ai/mergekit?tab=readme-ov-file#merge-methods).
+
+    Attributes
+    ----------
+        method (`str`): The merge method to use.
+        policy_model_path (`str` or `None`): Path to the policy model.
+        target_model_path (`str` or `None`): Path to the target model.
+        policy_model_weight (`float`): Weight for the policy model (for `linear` and `ties` methods).
+        target_model_weight (`float`): Weight for the target model (for `linear` and `ties` methods).
+        policy_model_density (`list[float]`): Density parameters for the policy model (for `ties` and `dare_ties`).
+        target_model_density (`list[float]`): Density parameters for the target model (for `ties` and `dare_ties`).
+        normalize (`float` or `None`): Normalization factor for the TIES method.
+        t_values (`float` or `None`): Interpolation factor for the SLERP method.
+        dtype (`str`): Data type to use for merging, e.g., `"float16"`.
+    """
+
+    def __init__(self, method: str = "linear"):
+        if not is_mergekit_available():
+            raise ImportError(
+                "MergeConfig requires the `mergekit` extra. To install, run `pip install mergekit`."
+            )
+        self.method = method
+        self.policy_model_path = None
+        self.target_model_path = None
+
+        # Initialize relevant parameters based on the method
+        if method == "linear":
+            self.policy_model_weight = 0.5
+            self.target_model_weight = 0.5
+            self.dtype = "float16"
+        elif method == "ties" or method == "dare_ties":
+            self.policy_model_weight = 1.0
+            self.policy_model_density = [1.0, 0.7, 0.1]
+            self.target_model_weight = 1.0
+            self.target_model_density = [1.0]
+            self.normalize = 1.0
+            self.dtype = "float16"
+        elif method == "slerp":
+            self.t_values = 0.5
+            self.dtype = "float16"
+        else:
+            raise ValueError(f"Unsupported merge method: {method}")
+
+    def create_merge_config_linear(self) -> "MergeConfiguration":
+        """
+        Creates a merge configuration for a linear merge of two models with specified weights.
+        """
+        # Create the merge configuration dictionary
+        merge_config_dict = {
+            "dtype": self.dtype,
+            "merge_method": "linear",
+            "models": [
+                {
+                    "model": self.policy_model_path,
+                    "parameters": {"weight": self.policy_model_weight},
+                },
+                {
+                    "model": self.target_model_path,
+                    "parameters": {"weight": self.target_model_weight},
+                },
+            ],
+        }
+
+        # Create the MergeConfiguration from the dictionary
+        merge_config = MergeConfiguration.model_validate(merge_config_dict)
+
+        return merge_config
+
+    def create_merge_config_ties(self) -> "MergeConfiguration":
+        """
+        Creates a merge configuration for a TIES merge of two models, with specified weights and densities.
+        """
+        # Create the TIES merge configuration dictionary
+        merge_config_dict = {
+            "merge_method": "ties",
+            "slices": None,  # Optional slices if needed
+            "models": [
+                {
+                    "model": {
+                        "model": {"path": self.target_model_path, "revision": None},
+                        "lora": None,
+                        "override_architecture": None,
+                    },
+                    "parameters": {
+                        "density": self.target_model_density,
+                        "weight": self.target_model_weight,
+                    },
+                },
+                {
+                    "model": {
+                        "model": {"path": self.policy_model_path, "revision": None},
+                        "lora": None,
+                        "override_architecture": None,
+                    },
+                    "parameters": {
+                        "density": self.policy_model_density,
+                        "weight": self.policy_model_weight,
+                    },
+                },
+            ],
+            "parameters": {"normalize": self.normalize},
+            "base_model": {
+                "model": {"path": self.policy_model_path, "revision": None},
+                "lora": None,
+                "override_architecture": None,
+            },
+            "dtype": self.dtype,
+            "tokenizer_source": None,
+            "tokenizer": None,
+            "chat_template": None,
+            "out_dtype": None,
+        }
+
+        # Create the MergeConfiguration from the dictionary
+        merge_config = MergeConfiguration.model_validate(merge_config_dict)
+
+        return merge_config
+
+    def create_merge_config_dare_ties(self) -> "MergeConfiguration":
+        """
+        Creates a merge configuration for a DARE TIES merge of two models, with specified weights and densities.
+        """
+        # Create the DARE TIES merge configuration dictionary
+        merge_config_dict = {
+            "merge_method": "dare_ties",
+            "slices": None,  # Optional slices if needed
+            "models": [
+                {
+                    "model": {
+                        "model": {"path": self.target_model_path, "revision": None},
+                        "lora": None,
+                        "override_architecture": None,
+                    },
+                    "parameters": {
+                        "density": self.target_model_density,
+                        "weight": self.target_model_weight,
+                    },
+                },
+                {
+                    "model": {
+                        "model": {"path": self.policy_model_path, "revision": None},
+                        "lora": None,
+                        "override_architecture": None,
+                    },
+                    "parameters": {
+                        "density": self.policy_model_density,
+                        "weight": self.policy_model_weight,
+                    },
+                },
+            ],
+            "parameters": {"normalize": self.normalize},
+            "base_model": {
+                "model": {"path": self.policy_model_path, "revision": None},
+                "lora": None,
+                "override_architecture": None,
+            },
+            "dtype": self.dtype,
+            "tokenizer_source": None,
+            "tokenizer": None,
+            "chat_template": None,
+            "out_dtype": None,
+        }
+
+        # Create the MergeConfiguration from the dictionary
+        merge_config = MergeConfiguration.model_validate(merge_config_dict)
+
+        return merge_config
+
+    def create_merge_config_slerp(self) -> "MergeConfiguration":
+        """
+        Creates a merge configuration for a SLERP merge of a model with a base model.
+        """
+        # Create the SLERP merge configuration dictionary
+        merge_config_dict = {
+            "merge_method": "slerp",
+            "slices": None,  # Optional slices if needed
+            "models": [
+                {
+                    "model": {
+                        "model": {"path": self.target_model_path, "revision": None},
+                        "lora": None,
+                        "override_architecture": None,
+                    },
+                    "parameters": None,  # No specific parameters for SLERP model
+                }
+            ],
+            "parameters": {
+                "t": self.t_values  # Set the t values for SLERP
+            },
+            "base_model": {
+                "model": {"path": self.policy_model_path, "revision": None},
+                "lora": None,
+                "override_architecture": None,
+            },
+            "dtype": self.dtype,
+            "tokenizer_source": None,
+            "tokenizer": None,
+            "chat_template": None,
+            "out_dtype": None,
+        }
+
+        # Create the MergeConfiguration from the dictionary
+        merge_config = MergeConfiguration.model_validate(merge_config_dict)
+
+        return merge_config
+
+    def create(self) -> "MergeConfiguration":
+        if self.method == "linear":
+            return self.create_merge_config_linear()
+        if self.method == "ties":
+            return self.create_merge_config_ties()
+        if self.method == "dare_ties":
+            return self.create_merge_config_dare_ties()
+        if self.method == "slerp":
+            return self.create_merge_config_slerp()
+
+
+def merge_models(config: MergeConfig, out_path: str):
+    """
+    Merge two models using mergekit
+
+    Args:
+        config ([`MergeConfig`]): The merge configuration.
+        out_path (`str`): The output path for the merged model.
+    """
+    if not is_mergekit_available():
+        raise ImportError(
+            "merge_models requires the `mergekit` extra. To install, run `pip install mergekit`."
+        )
+    run_merge(
+        config,
+        out_path=out_path,
+        options=MergeOptions(
+            device="auto",
+            cuda=torch.cuda.is_available(),
+            copy_tokenizer=True,
+            lazy_unpickle=False,
+            low_cpu_memory=False,
+        ),
+    )
diff --git a/src/aixpert/training/training/trl/models/__init__.py b/src/aixpert/training/training/trl/models/__init__.py
new file mode 100644
index 0000000..ce4492b
--- /dev/null
+++ b/src/aixpert/training/training/trl/models/__init__.py
@@ -0,0 +1,70 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING
+
+from ..import_utils import _LazyModule
+
+
+_import_structure = {
+    "activation_offloading": ["get_act_offloading_ctx_manager"],
+    "modeling_base": [
+        "GeometricMixtureWrapper",
+        "PreTrainedModelWrapper",
+        "create_reference_model",
+    ],
+    "modeling_value_head": [
+        "AutoModelForCausalLMWithValueHead",
+        "AutoModelForSeq2SeqLMWithValueHead",
+    ],
+    "utils": [
+        "SUPPORTED_ARCHITECTURES",
+        "clone_chat_template",
+        "prepare_deepspeed",
+        "prepare_fsdp",
+        "prepare_model_for_kbit_training",
+        "prepare_peft_model",
+        "setup_chat_format",
+        "unwrap_model_for_generation",
+    ],
+}
+
+
+if TYPE_CHECKING:
+    from .activation_offloading import get_act_offloading_ctx_manager
+    from .modeling_base import (
+        GeometricMixtureWrapper,
+        PreTrainedModelWrapper,
+        create_reference_model,
+    )
+    from .modeling_value_head import (
+        AutoModelForCausalLMWithValueHead,
+        AutoModelForSeq2SeqLMWithValueHead,
+    )
+    from .utils import (
+        SUPPORTED_ARCHITECTURES,
+        clone_chat_template,
+        prepare_deepspeed,
+        prepare_fsdp,
+        prepare_model_for_kbit_training,
+        prepare_peft_model,
+        setup_chat_format,
+        unwrap_model_for_generation,
+    )
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__, globals()["__file__"], _import_structure, module_spec=__spec__
+    )
diff --git a/src/aixpert/training/training/trl/models/activation_offloading.py b/src/aixpert/training/training/trl/models/activation_offloading.py
new file mode 100644
index 0000000..9c9bcb1
--- /dev/null
+++ b/src/aixpert/training/training/trl/models/activation_offloading.py
@@ -0,0 +1,794 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of https://github.com/pytorch/torchtune.
+
+
+import psutil
+import torch
+from accelerate import logging
+from accelerate.utils.versions import is_torch_version
+from torch import nn
+from torch.autograd.graph import saved_tensors_hooks
+from transformers import is_torch_npu_available
+
+
+if is_torch_npu_available():
+    import torch_npu  # noqa: F401
+
+# Import DTensor for FSDP v2 support with version-aware import path
+DTensor = None
+if torch.distributed.is_available():
+    try:
+        if is_torch_version(">=", "2.5.0"):
+            from torch.distributed.tensor import DTensor
+        else:
+            # from torch 2.0.0 (oldest supported accelerate torch version), DTensor is in torch.distributed._tensor
+            from torch.distributed._tensor import DTensor
+    except (ImportError, AttributeError):
+        DTensor = None
+
+logger = logging.get_logger(__name__)
+
+
+def _get_unique_tensor_key(tensor: torch.Tensor) -> tuple:
+    """
+    Get a unique key for a tensor based on its storage pointer and dtype. This allows deduplication of tensors that
+    share the same underlying storage. From:
+    https://github.com/volcengine/verl/blob/main/verl/utils/activation_offload.py
+
+    Args:
+        tensor: The tensor to get the key for
+
+    Returns
+    -------
+        A tuple of (storage_pointer, dtype) that uniquely identifies the tensor's storage
+    """
+    # Handle special tensor types - primarily for FSDP v2 DTensor
+    actual_tensor = tensor
+
+    # For DTensor (FSDP v2), extract the local tensor
+    if (
+        DTensor is not None
+        and isinstance(tensor, DTensor)
+        and hasattr(tensor, "_local_tensor")
+    ):
+        actual_tensor = tensor._local_tensor
+
+    # Try to get storage pointer, but fall back to tensor id if not accessible
+    try:
+        storage_ptr = (
+            actual_tensor.untyped_storage().data_ptr() + actual_tensor.storage_offset()
+        )
+    except (RuntimeError, AttributeError):
+        # For tensors with invalid storage, use tensor id
+        # This won't enable deduplication for these tensors, but allows offloading to work
+        storage_ptr = id(actual_tensor)
+
+    return (storage_ptr, actual_tensor.dtype)
+
+
+class OffloadActivations(saved_tensors_hooks):
+    """
+    Context manager under which activation tensors created in the forward pass will be offloaded.
+
+    Enable the memory efficiency technique of activation offloading, where activations bigger than `min_offload_size`
+    bytes will be offloaded to CPU in the forward and brought back in the backward. This is in contrast to maintaining
+    the activation on GPU VRAM throughout the program.
+
+    This manager contains the option of using one additional CUDA stream to handle the communication between CUDA and
+    CPU, which is intended to overlap with the default computation stream to improve runtime. We designed
+    synchronization with a few heuristics for optimizing the tradeoff between runtime vs memory usage.
+
+    Args:
+        use_pin_memory (`bool`, *optional*, defaults to `True`):
+            Whether to offloaded Tensor will be placed in pinned memory on the CPU. Pinned memory allows the Tensor to
+            be moved back onto GPU more quickly but is a limited resource.
+        use_streams (`bool`, *optional*, defaults to `True`):
+            Whether to use streams for performance optimization where the communications get overlapped with the
+            computation. Requires a torch build after torch-2.5.0.
+        min_offload_size (`int`, *optional*, defaults to `1024`):
+            Minimum number of bytes a Tensor must be in order to qualify for offloading. If the tensor is too small, we
+            do not want to waste bandwidth and resources moving it to CPU and back.
+        max_fwd_stash_size (`int`, *optional*, defaults to `5`):
+            Maximum size of the forward stash, or the maximum number of consecutive activations to keep alive during
+            the forward pass. This number must be at least 1. Keeping alive more activations will potentially allow
+            more overlap between the communication and compute streams at the cost of increasing memory usage. Keeping
+            alive fewer activations will conserve memory, but may cause poor overlap between the streams, increasing
+            runtime.
+
+    Raises
+    ------
+        ValueError: if `max_fwd_stash_size` is not at least `1`.
+
+    Example:
+    ```python
+    >>> with OffloadActivations():
+    ...     outputs = model(inputs, labels=labels)
+    >>> loss = outputs.loss
+    >>> loss.backward()
+    ```
+    """
+
+    def __init__(
+        self,
+        use_pin_memory: bool = True,
+        use_streams: bool = True,
+        min_offload_size: int = 1024,
+        max_fwd_stash_size: int = 5,
+    ) -> None:
+        self.use_streams = use_streams
+
+        self.min_tensor_size_bytes = (
+            min_offload_size  # we don't want to bother with small tensors
+        )
+        self.tracker = {}  # tensor_id => (new_tensor, if_modified)  ---> track what saved/offloaded tensors are where
+        self.tensor_id = 0
+        self.is_first_forward_call = True
+        self.is_first_backward_call = True
+        self.is_first_forward_pass = True
+
+        # Storage deduplication: maps storage key to tensor_id to avoid offloading same storage multiple times
+        self.storage_to_tensor_id = {}
+
+        # Parameter filtering: track parameter storage pointers to skip them during offloading
+        self.param_storages = set()
+
+        # Managing cpu memory
+        self.use_pin_memory = use_pin_memory
+        self.virtual_memory_safe_pct = (
+            60  # we should not exceed this percentage of memory
+        )
+
+        self.accelerator_type = (
+            torch.accelerator.current_accelerator().type
+            if hasattr(torch, "accelerator")
+            else "cuda"
+        )
+        # NOTE: xpu doesn't have `default_stream` API, use `current_stream` instead
+        if self.accelerator_type == "xpu":  # comp stream
+            self.s0 = torch.xpu.current_stream()
+        elif is_torch_npu_available() and self.accelerator_type == "npu":
+            self.s0 = torch.npu.current_stream()
+        else:
+            self.s0 = torch.cuda.default_stream()
+
+        # For streaming
+        if self.use_streams:
+            if self.accelerator_type == "xpu":  # comms stream
+                self.s1 = torch.xpu.Stream()
+            elif self.accelerator_type == "npu":
+                self.s1 = torch.npu.Stream()
+            else:
+                self.s1 = torch.cuda.Stream()
+            self.fwd_stash = {}  # tensor_id => (activation, ev1)
+            if max_fwd_stash_size < 1:
+                raise ValueError(
+                    f"max_fwd_stash_size should be at least 1 but is {max_fwd_stash_size}"
+                )
+            self.max_fwd_stash_size = max_fwd_stash_size
+            self.bwd_tensor_stash = {}  # tensor_id => activation
+            self.bwd_ev_stash = {}  # tensor_id => ev0
+            self.curr_graph_id = None
+            self.curr_autograd_node = None
+
+        # -------- platform util functions -------- #
+        def verify_sufficient_virtual_memory():
+            curr_pct = get_cpu_ram_pct()
+            if curr_pct > self.virtual_memory_safe_pct:
+                logger.warning(
+                    f"{curr_pct=}% > {self.virtual_memory_safe_pct=}% of virtual memory used"
+                )
+
+        def get_cpu_ram_pct() -> float:
+            # get the percentage of memory used by the system
+            return psutil.virtual_memory().percent
+
+        def get_tensor_id() -> int:
+            # create a unique id for each tensor we are managing
+            self.tensor_id += 1
+            return self.tensor_id
+
+        def get_num_bytes_tensor(x: torch.Tensor) -> int:
+            # get the number of bytes in a tensor, for memory management purposes
+            return (
+                x.element_size() * x.nelement()
+            )  # x.element_size() * x._base_storage().nbytes()
+
+        # -------- core pack / unpack work -------- #
+        def pack_tensor(activation: torch.Tensor) -> int:
+            # activations are passed in during forward pass - from here we take over and return a unique id
+            if self.is_first_forward_call:
+                if len(self.tracker) != 0:
+                    raise ValueError(
+                        "Backward pass should have cleared tracker of all tensors"
+                    )
+
+                # set training phase trackers
+                self.is_first_forward_call = False
+                self.is_first_backward_call = True
+                # Reset deduplication map for new forward pass
+                self.storage_to_tensor_id = {}
+
+            # query for basic tensor info
+            num_bytes = get_num_bytes_tensor(activation)
+            tensor_id = get_tensor_id()
+
+            # Check for tensor deduplication using storage pointer
+            # If this storage is already being tracked, we still create a new tensor_id
+            # but don't offload again (just keep the tensor in GPU)
+            storage_key = _get_unique_tensor_key(activation)
+            if storage_key in self.storage_to_tensor_id:
+                # Storage already offloaded - don't offload again, just track the reference
+                self.tracker[tensor_id] = (
+                    activation,
+                    False,
+                    None,
+                    None,
+                    None,
+                )  # Keep on GPU, don't offload
+                return tensor_id
+
+            # Check if tensor is on CPU (skip offloading)
+            if activation.device.type not in ["cuda", "xpu", "npu"]:
+                self.tracker[tensor_id] = (activation, False, None, None, None)
+                return tensor_id
+
+            # Check if tensor is too small
+            if num_bytes < self.min_tensor_size_bytes:
+                self.tracker[tensor_id] = (activation, False, None, None, None)
+                return tensor_id
+
+            # Check if tensor is a parameter or buffer
+            if isinstance(activation, torch.nn.Parameter) or (
+                hasattr(torch.nn, "Buffer") and isinstance(activation, torch.nn.Buffer)
+            ):
+                self.tracker[tensor_id] = (activation, False, None, None, None)
+                return tensor_id
+
+            # Check if tensor is an FP8 tensor (TorchAO) - skip offloading as they're already compressed
+            tensor_class_name = type(activation).__name__
+            if tensor_class_name in [
+                "Float8TrainingTensor",
+                "ScaledMMConfig",
+                "LinearMMConfig",
+            ]:
+                self.tracker[tensor_id] = (activation, False, None, None, None)
+                return tensor_id
+
+            # Check if tensor storage is a model parameter (for FSDP compatibility)
+            try:
+                # Extract actual tensor for DTensor
+                check_tensor = activation
+                if (
+                    DTensor is not None
+                    and isinstance(activation, DTensor)
+                    and hasattr(activation, "_local_tensor")
+                ):
+                    check_tensor = activation._local_tensor
+
+                if check_tensor.untyped_storage().data_ptr() in self.param_storages:
+                    self.tracker[tensor_id] = (activation, False, None, None, None)
+                    return tensor_id
+            except (RuntimeError, AttributeError):
+                # If we can't get data_ptr, skip this check
+                pass
+
+            # Tensor qualifies for offloading
+            if self.use_streams:
+                # First, sync back and dereference previously offloaded tensors
+                # as the offloading should be done sufficiently long ago.
+                for id in list(self.fwd_stash.keys()):
+                    if id <= tensor_id - self.max_fwd_stash_size:
+                        _, ev = self.fwd_stash[id]
+                        self.s0.wait_event(ev)
+                        del self.fwd_stash[id]
+                    else:
+                        break
+
+                # Sync in, offload, and add an event to sync back later
+                self.s1.wait_stream(self.s0)
+
+            stream = self.s1 if self.use_streams else self.s0
+            if self.accelerator_type == "xpu":
+                stream_ctx = torch.xpu.stream(stream)
+            elif self.accelerator_type == "npu":
+                stream_ctx = torch.npu.stream(stream)
+            else:
+                stream_ctx = torch.cuda.stream(stream)
+            with stream_ctx:
+                # Save original stride and shape information
+                original_stride = activation.stride()
+                original_storage_offset = activation.storage_offset()
+                original_shape = activation.size()
+
+                # Check if tensor has broadcast dimensions (stride == 0)
+                # If so, copy the underlying storage directly instead of materializing the broadcast
+                has_broadcast = 0 in original_stride
+
+                if has_broadcast:
+                    # Copy only the actual underlying storage, not the materialized broadcast
+                    # Create CPU tensor with same storage size as original
+                    storage_size = activation.untyped_storage().size()
+                    cpu_storage = torch.empty(
+                        storage_size // activation.element_size(),
+                        dtype=activation.dtype,
+                        pin_memory=self.use_pin_memory,
+                        device="cpu",
+                    )
+                    # Copy the raw storage
+                    cpu_storage_view = torch.as_strided(
+                        activation,
+                        size=(storage_size // activation.element_size(),),
+                        stride=(1,),
+                        storage_offset=0,
+                    )
+                    cpu_storage.copy_(cpu_storage_view, non_blocking=True)
+                    cpu_tensor = cpu_storage
+                else:
+                    # No broadcast - use normal contiguous copy
+                    cpu_tensor = torch.empty_like(
+                        activation, pin_memory=self.use_pin_memory, device="cpu"
+                    )
+                    cpu_tensor.copy_(activation, non_blocking=True)
+
+                # Store CPU tensor along with stride information
+                self.tracker[tensor_id] = (
+                    cpu_tensor,
+                    True,  # True = (in future) modified
+                    original_stride,  # Save original GPU stride
+                    original_storage_offset,  # Save original storage offset
+                    original_shape,  # Save original shape for broadcast restoration
+                )
+
+            if self.use_streams:
+                event = self.s1.record_event()
+
+                # Stash to keep activation alive til s1 is done
+                self.fwd_stash[tensor_id] = (activation, event)
+
+            # Track this storage for deduplication
+            self.storage_to_tensor_id[storage_key] = tensor_id
+
+            return tensor_id
+
+        def unpack_tensor_single_stream(unpack_tensor_id: int) -> torch.Tensor:
+            # backward pass - we are called with the tensor_id, which
+            # we will use to retrieve the saved/offloaded tensor
+            if self.is_first_backward_call:
+                if self.is_first_forward_pass:
+                    self.is_first_forward_pass = False
+                    if self.use_pin_memory:
+                        verify_sufficient_virtual_memory()
+
+                self.is_first_backward_call = False
+
+            if unpack_tensor_id not in self.tracker:
+                raise ValueError(f"Untracked tensor with id {unpack_tensor_id}")
+
+            (
+                maybe_accelerator_tensor,
+                modified,
+                original_stride,
+                original_storage_offset,
+                original_shape,
+            ) = self.tracker[unpack_tensor_id]
+
+            if modified:
+                # Restore tensor to GPU
+                accelerator_tensor = maybe_accelerator_tensor.to(
+                    self.accelerator_type, non_blocking=True
+                )
+                # Restore original stride if we saved it (handles both broadcast and non-broadcast cases)
+                if original_stride is not None:
+                    accelerator_tensor = torch.as_strided(
+                        accelerator_tensor,
+                        size=original_shape,
+                        stride=original_stride,
+                        storage_offset=original_storage_offset,
+                    )
+                maybe_accelerator_tensor = accelerator_tensor
+
+            # clear tensor from tracking
+            del self.tracker[unpack_tensor_id]
+            # Only set is_first_forward_call to True when all tensors have been unpacked
+            if len(self.tracker) == 0:
+                self.is_first_forward_call = True
+            return maybe_accelerator_tensor
+
+        def unpack_tensor_with_streams(unpack_tensor_id: int) -> torch.Tensor:
+            # backward pass - we are called with the tensor_id, which
+            # we will use to retrieve the saved/offloaded tensor
+            if self.is_first_backward_call:
+                self.curr_graph_id = torch._C._current_graph_task_id()
+
+                def wait_and_del_remaining_references() -> None:
+                    for id in list(self.bwd_tensor_stash.keys()):
+                        if id in self.bwd_ev_stash:
+                            event = self.bwd_ev_stash[id]
+                            self.s1.wait_event(event)
+                        del self.bwd_tensor_stash[id]
+
+                # Register a callback to the end of autograd to clean everything up
+                torch.autograd.variable.Variable._execution_engine.queue_callback(
+                    wait_and_del_remaining_references
+                )
+
+                if self.is_first_forward_pass:
+                    self.is_first_forward_pass = False
+                    if self.use_pin_memory:
+                        verify_sufficient_virtual_memory()
+
+                self.is_first_backward_call = False
+
+            if unpack_tensor_id not in self.tracker:
+                raise ValueError(f"untracked tensor with id {unpack_tensor_id}")
+
+            (
+                maybe_accelerator_tensor,
+                modified,
+                original_stride,
+                original_storage_offset,
+                original_shape,
+            ) = self.tracker[unpack_tensor_id]
+
+            if modified:
+                # Get data on the current autograd node
+                graph_id = torch._C._current_graph_task_id()
+                node = torch._C._current_autograd_node()
+                prev_node_ids = []
+
+                # If we're on a new node, mark prev node's tensors to be freed later
+                if graph_id == self.curr_graph_id and self.curr_autograd_node != node:
+                    self.curr_autograd_node = node
+                    prev_node_ids = list(self.bwd_tensor_stash.keys())
+
+                brought_back_from_cpu = True
+                if unpack_tensor_id in self.fwd_stash:
+                    maybe_accelerator_tensor = self.fwd_stash[unpack_tensor_id][0]
+                    brought_back_from_cpu = False
+                else:
+                    # Kick off the process to bring tensors back
+                    if self.accelerator_type == "xpu":
+                        stream_ctx = torch.xpu.stream(self.s1)
+                    elif self.accelerator_type == "npu":
+                        stream_ctx = torch.npu.stream(self.s1)
+                    else:
+                        stream_ctx = torch.cuda.stream(self.s1)
+                    with stream_ctx:
+                        # Restore tensor to GPU
+                        accelerator_tensor = maybe_accelerator_tensor.to(
+                            self.accelerator_type, non_blocking=True
+                        )
+                        # Restore original stride if we saved it (handles both broadcast and non-broadcast cases)
+                        if original_stride is not None:
+                            accelerator_tensor = torch.as_strided(
+                                accelerator_tensor,
+                                size=original_shape,
+                                stride=original_stride,
+                                storage_offset=original_storage_offset,
+                            )
+                        maybe_accelerator_tensor = accelerator_tensor
+
+                    # Tell comp stream to wait for the info to be loaded before executing
+                    self.s0.wait_stream(self.s1)
+
+                    # Stash the tensor to keep memory alive until compute stream is complete
+                    self.bwd_tensor_stash[unpack_tensor_id] = maybe_accelerator_tensor
+
+                    # Note: [Track views of the unpacked]
+                    # Why do we get the use count of the unpacked tensor here? We want an
+                    # initial count to compare to later, during the post-hook of the
+                    # backward node, when we need to decide whether we're allowed to free
+                    # the tensor yet. In what obscure cases must we delay freeing the
+                    # tensor (and thus call record_stream)?
+                    # 1. Any of the outputs of the backward node is a view of the unpacked
+                    #    tensor.
+                    # 2. In the case that this unpacked tensor will be used in a
+                    #    checkpointed region, if one of the recomputed saved tensors ends
+                    #    up as a view of the unpacked tensor.
+                    # 3. The user abuses the system somehow and manually relies on the
+                    #    unpacked tensor to exist after the backward node has executed.
+                    if self.accelerator_type == "npu":
+                        storage_refcount = torch_npu._C._storage_Use_Count(
+                            maybe_accelerator_tensor.untyped_storage()._cdata
+                        )
+                    else:
+                        storage_refcount = torch._C._storage_Use_Count(
+                            maybe_accelerator_tensor.untyped_storage()._cdata
+                        )
+
+                def hook(outputs, inputs):
+                    # create events for the current node inputs/outputs if they were streamed in
+                    if brought_back_from_cpu:
+                        # See Note: [Track views of the unpacked]
+                        # IF any of the outputs is a view of the tensor, OR if a view of
+                        # the tensor has been saved as a part of checkpoint's recompute
+                        # process, OR the user has abusedly incurred a reference on the
+                        # unpacked tensor, THEN the tensor might be used later and we
+                        # cannot presume to delete it after only the current node is
+                        # done! So we use our frenemy, record_stream, to ensure the
+                        # Tensor stays unmessed with until it's done getting used in the
+                        # compute stream (s0 here). Note that the con here is we introduce
+                        # non-deterministic (thus higher) memory usage, but this case
+                        # should not happen often.
+                        # Check if tensor still exists (might have been cleaned up by a previous node)
+                        if unpack_tensor_id in self.bwd_tensor_stash:
+                            unpacked_tensor = self.bwd_tensor_stash[unpack_tensor_id]
+                            if self.accelerator_type == "npu":
+                                storage_count = torch_npu._C._storage_Use_Count(
+                                    unpacked_tensor.untyped_storage()._cdata
+                                )
+                            else:
+                                storage_count = torch._C._storage_Use_Count(
+                                    unpacked_tensor.untyped_storage()._cdata
+                                )
+                            if storage_count > storage_refcount:
+                                unpacked_tensor.record_stream(self.s0)
+                                del self.bwd_tensor_stash[unpack_tensor_id]
+                            else:
+                                event = self.s0.record_event()
+                                self.bwd_ev_stash[unpack_tensor_id] = event
+
+                    # if there are still things in the fwd_stash, get rid of them as we're in bwd now
+                    for id in list(self.fwd_stash.keys()):
+                        _, ev = self.fwd_stash[id]
+                        self.s0.wait_event(ev)
+                        del self.fwd_stash[id]
+
+                    # wait on prev node's events and del those
+                    for id in prev_node_ids:
+                        # Only wait on events that exist (some tensors may have used record_stream instead)
+                        if id in self.bwd_ev_stash:
+                            event = self.bwd_ev_stash[id]
+                            self.s1.wait_event(event)
+                            del self.bwd_ev_stash[id]
+                        if id in self.bwd_tensor_stash:
+                            del self.bwd_tensor_stash[id]
+
+                    return outputs
+
+                node.register_hook(hook)
+
+            # clear tensor from tracking
+            del self.tracker[unpack_tensor_id]
+            # Only set is_first_forward_call to True when all tensors have been unpacked
+            if len(self.tracker) == 0:
+                self.is_first_forward_call = True
+            return maybe_accelerator_tensor
+
+        unpack_tensor = (
+            unpack_tensor_with_streams
+            if self.use_streams
+            else unpack_tensor_single_stream
+        )
+        super().__init__(pack_tensor, unpack_tensor)
+
+    def update_model_params(self, model: nn.Module):
+        """
+        Update the set of parameter storage pointers from the model. This allows filtering out model parameters during
+        offloading, which is especially important for FSDP models where parameters may not be detected by isinstance
+        checks.
+
+        For FSDP v2, this method handles DTensor parameters which may be sharded across ranks and not have valid local
+        storage on all ranks. We extract the local tensor from DTensors using _local_tensor when available.
+
+        Args:
+            model: The model whose parameters should be tracked
+        """
+        param_storages = set()
+
+        for p in model.parameters():
+            # For FSDP v2: extract local tensor from DTensor
+            actual_tensor = p
+            if (
+                DTensor is not None
+                and isinstance(p, DTensor)
+                and hasattr(p, "_local_tensor")
+            ):
+                actual_tensor = p._local_tensor
+
+            # Try to get storage pointer
+            try:
+                storage_ptr = actual_tensor.untyped_storage().data_ptr()
+                if storage_ptr != 0:
+                    param_storages.add(storage_ptr)
+            except RuntimeError:
+                # Parameter doesn't have accessible storage (e.g., FSDP v2 sharded without local shard, FP8 parameters)
+                # These will be caught by other checks (isinstance for Parameter, class name for FP8)
+                continue
+
+        self.param_storages = param_storages
+
+
+class NoOpManager(saved_tensors_hooks):
+    """
+    A `saved_tensors_hook` manager used to disable any other `saved_tensors_hook` manager applied before. This relies
+    on the behavior that only the most recently registered `saved_tensors_hook` will run.
+
+    One example usage is to opt a local region of code out of activations offloading, which is usually applied globally
+    to best track state.
+    """
+
+    def __init__(self) -> None:
+        def noop(tensor):
+            return tensor
+
+        super().__init__(noop, noop)
+
+
+def get_act_offloading_ctx_manager(
+    model: nn.Module,
+    use_pin_memory: bool = True,
+    use_streams: bool = True,
+    min_offload_size: int = 1024,
+    max_fwd_stash_size: int = 5,
+    warn_if_no_head: bool = True,
+) -> OffloadActivations:
+    """
+    Returns the activation offloading context manager for the model. All but the last output Linear in every step will
+    be offloaded.
+
+    If activation offloading is enabled, we return the OffloadActivations context manager. If activation offloading is
+    disabled, we return a NoOpManager context manager.
+
+    Args:
+        model (`nn.Module`):
+            Model to wrap with the activation offloading context manager.
+        use_pin_memory (`bool`, *optional*, defaults to `True`):
+            Whether to offloaded Tensor will be placed in pinned memory on the CPU. Pinned memory allows the Tensor to
+            be moved back onto GPU more quickly but is a limited resource.
+        use_streams (`bool`, *optional*, defaults to `True`):
+            Whether to use streams for performance optimization where the communications get overlapped with the
+            computation. Requires a torch build after torch-2.5.0.
+        min_offload_size (`int`, *optional*, defaults to `1024`):
+            Minimum number of bytes a Tensor must be in order to qualify for offloading. If the tensor is too small, we
+            do not want to waste bandwidth and resources moving it to CPU and back.
+        max_fwd_stash_size (`int`, *optional*, defaults to `5`):
+            Maximum size of the forward stash, or the maximum number of consecutive activations to keep alive during
+            the forward pass. This number must be at least 1. Keeping alive more activations will potentially allow
+            more overlap between the communication and compute streams at the cost of increasing memory usage. Keeping
+            alive fewer activations will conserve memory, but may cause poor overlap between the streams, increasing
+            runtime.
+        warn_if_no_head (`bool`, *optional*, defaults to `True`):
+            Whether to warn if no output head is detected. If set to `False`, no warning will be raised if no output
+            head is detected.
+
+    Returns
+    -------
+        `contextlib.ContextDecorator`:
+            Activation offloading context manager for the model.
+    """
+    activations_handling_ctx = OffloadActivations(
+        use_pin_memory=use_pin_memory,
+        use_streams=use_streams,
+        min_offload_size=min_offload_size,
+        max_fwd_stash_size=max_fwd_stash_size,
+    )
+
+    # Update parameter storages to filter them during offloading (important for FSDP)
+    activations_handling_ctx.update_model_params(model)
+
+    # Below is our hack to disable offloading the last output Linear in every
+    # step, as the cost for offloading the activation and then soon after bringing
+    # it back is expensive.
+    output_head_detected = False
+    noop_ctx = NoOpManager()
+
+    # Try to get the actual model if it's wrapped
+    unwrapped_model = model
+    if hasattr(unwrapped_model, "module"):
+        unwrapped_model = unwrapped_model.module
+    # check for PEFT models
+    if hasattr(unwrapped_model, "base_model") and hasattr(
+        unwrapped_model, "peft_config"
+    ):
+        unwrapped_model = unwrapped_model.base_model
+
+    # Check for different types of output heads
+    if hasattr(unwrapped_model, "output"):
+        if isinstance(unwrapped_model.output, nn.Module):
+            unwrapped_model.output.register_forward_pre_hook(
+                lambda *args: noop_ctx.__enter__()
+            )
+            unwrapped_model.output.register_forward_hook(
+                lambda *args: noop_ctx.__exit__(), always_call=True
+            )
+            output_head_detected = True
+        elif hasattr(unwrapped_model.output, "linear") and isinstance(
+            unwrapped_model.output.linear, nn.Module
+        ):
+            unwrapped_model.output.linear.register_forward_pre_hook(
+                lambda *args: noop_ctx.__enter__()
+            )
+            unwrapped_model.output.linear.register_forward_hook(
+                lambda *args: noop_ctx.__exit__(), always_call=True
+            )
+            output_head_detected = True
+
+    # Check for HuggingFace model output heads
+    elif hasattr(unwrapped_model, "lm_head"):
+        unwrapped_model.lm_head.register_forward_pre_hook(
+            lambda *args: noop_ctx.__enter__()
+        )
+        unwrapped_model.lm_head.register_forward_hook(
+            lambda *args: noop_ctx.__exit__(), always_call=True
+        )
+        output_head_detected = True
+
+    # Check for decoder-based models
+    elif hasattr(unwrapped_model, "decoder"):
+        decoder = unwrapped_model.decoder
+        if hasattr(decoder, "output"):
+            decoder.output.register_forward_pre_hook(lambda *args: noop_ctx.__enter__())
+            decoder.output.register_forward_hook(
+                lambda *args: noop_ctx.__exit__(), always_call=True
+            )
+            output_head_detected = True
+        # Some models have lm_head in the decoder
+        elif hasattr(decoder, "lm_head"):
+            decoder.lm_head.register_forward_pre_hook(
+                lambda *args: noop_ctx.__enter__()
+            )
+            decoder.lm_head.register_forward_hook(
+                lambda *args: noop_ctx.__exit__(), always_call=True
+            )
+            output_head_detected = True
+
+    # Check for transformer models with final layer norm
+    elif hasattr(unwrapped_model, "final_layer_norm") or hasattr(
+        unwrapped_model, "ln_f"
+    ):
+        final_norm = (
+            getattr(unwrapped_model, "final_layer_norm", None) or unwrapped_model.ln_f
+        )
+        final_norm.register_forward_pre_hook(lambda *args: noop_ctx.__enter__())
+        final_norm.register_forward_hook(
+            lambda *args: noop_ctx.__exit__(), always_call=True
+        )
+        output_head_detected = True
+
+    # Check for models with head module
+    elif hasattr(unwrapped_model, "head") and isinstance(
+        unwrapped_model.head, nn.Module
+    ):
+        unwrapped_model.head.register_forward_pre_hook(
+            lambda *args: noop_ctx.__enter__()
+        )
+        unwrapped_model.head.register_forward_hook(
+            lambda *args: noop_ctx.__exit__(), always_call=True
+        )
+        output_head_detected = True
+
+    if not output_head_detected and warn_if_no_head:
+        logger.warning(
+            "During activation offloading, no output head was detected. If your model has an output head, it will be "
+            "offloaded. This usually greatly slows training, given the large vocabulary size. To change this "
+            "behavior, set your output head as model.output and make it an nn.Module. You can disable this warning by "
+            "passing `warn_if_no_head=False`."
+        )
+
+    # Disable offloading for any Liger modules
+    for name, module in unwrapped_model.named_modules():
+        if "liger" in name.lower():
+            module.register_forward_pre_hook(lambda *args: noop_ctx.__enter__())
+            module.register_forward_hook(
+                lambda *args: noop_ctx.__exit__(), always_call=True
+            )
+
+    return activations_handling_ctx
diff --git a/src/aixpert/training/training/trl/models/modeling_base.py b/src/aixpert/training/training/trl/models/modeling_base.py
new file mode 100644
index 0000000..d7703c4
--- /dev/null
+++ b/src/aixpert/training/training/trl/models/modeling_base.py
@@ -0,0 +1,839 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import logging
+import os
+from copy import deepcopy
+
+import torch
+from accelerate import PartialState
+from huggingface_hub import hf_hub_download
+from huggingface_hub.utils import (
+    EntryNotFoundError,
+    HFValidationError,
+    LocalEntryNotFoundError,
+    RepositoryNotFoundError,
+)
+from safetensors.torch import load_file as safe_load_file
+from torch import nn
+from transformers import (
+    GenerationMixin,
+    PreTrainedModel,
+    is_torch_npu_available,
+    is_torch_xpu_available,
+)
+from transformers.utils import is_peft_available
+
+
+if is_peft_available():
+    from peft import (
+        PeftConfig,
+        PeftModel,
+        PeftModelForCausalLM,
+        PeftModelForSeq2SeqLM,
+        PromptLearningConfig,
+        get_peft_model,
+        prepare_model_for_kbit_training,
+    )
+
+
+from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
+
+
+LAYER_PATTERNS = [
+    "transformer.h.{layer}",
+    "model.decoder.layers.{layer}",
+    "gpt_neox.layers.{layer}",
+    "model.layers.{layer}",
+]
+
+
+class PreTrainedModelWrapper(nn.Module):
+    """
+    Wrapper for a [`~transformers.PreTrainedModel`] implemented as a standard PyTorch [`torch.nn.Module`].
+
+    This class provides a compatibility layer that preserves the key attributes and methods of the original
+    [`~transformers.PreTrainedModel`], while exposing a uniform interface consistent with PyTorch modules. It enables
+    seamless integration of pretrained Transformer models into custom training, evaluation, or inference workflows.
+
+    Attributes
+    ----------
+        pretrained_model ([`~transformers.PreTrainedModel`]):
+            The model to be wrapped.
+        parent_class ([`~transformers.PreTrainedModel`]):
+            The parent class of the model to be wrapped.
+        supported_args (`list`):
+            The list of arguments that are supported by the wrapper class.
+    """
+
+    transformers_parent_class = None
+    supported_args = None
+    supported_modules = ("v_head",)
+    supported_rm_modules = ("score",)
+    supported_pretrained_model_architectures = (
+        (PreTrainedModel)
+        if not is_peft_available()
+        else (PreTrainedModel, PeftModelForCausalLM, PeftModelForSeq2SeqLM)
+    )
+
+    def __init__(
+        self,
+        pretrained_model=None,
+        score_module=None,
+        supports_rm_adapter=False,
+        rm_adapter_name=None,
+        **kwargs,
+    ):
+        super().__init__()
+        self.pretrained_model = pretrained_model
+
+        self.config = pretrained_model.config
+        self.prepare_inputs_for_generation = (
+            pretrained_model.prepare_inputs_for_generation
+        )
+        self.is_loaded_in_8bit = getattr(pretrained_model, "is_loaded_in_8bit", False)
+        self.is_loaded_in_4bit = getattr(pretrained_model, "is_loaded_in_4bit", False)
+        self.is_sequential_parallel = False
+
+        if hasattr(pretrained_model, "gradient_checkpointing_disable"):
+            self.gradient_checkpointing_disable = (
+                pretrained_model.gradient_checkpointing_disable
+            )
+
+        if hasattr(pretrained_model, "gradient_checkpointing_enable"):
+            self.gradient_checkpointing_enable = (
+                pretrained_model.gradient_checkpointing_enable
+            )
+
+        if hasattr(pretrained_model, "enable_input_require_grads"):
+            self.enable_input_require_grads = (
+                pretrained_model.enable_input_require_grads
+            )
+
+        self.supports_rm_adapter = supports_rm_adapter
+        self.rm_adapter_name = rm_adapter_name
+        self.policy_adapter_name = "default"
+        if score_module is not None:
+            self.score = score_module
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+        r"""
+        Instantiates a new model from a pretrained model from `transformers`. The pretrained model is loaded using the
+        `from_pretrained` method of the [`~transformers.PreTrainedModel`] class. The arguments that are specific to the
+        [`~transformers.PreTrainedModel`] class are passed along this method and filtered out from the `kwargs`
+        argument.
+
+        Args:
+            pretrained_model_name_or_path (`str` or [`~transformers.PreTrainedModel`]):
+                The path to the pretrained model or its name.
+            *model_args (`list`, *optional*):
+                Additional positional arguments passed along to the underlying model's `from_pretrained` method.
+            **kwargs (`dict`, *optional*):
+                Additional keyword arguments passed along to the underlying model's `from_pretrained` method. We also
+                pre-process the kwargs to extract the arguments that are specific to the
+                [`~transformers.PreTrainedModel`] class and the arguments that are specific to trl models. The kwargs
+                also support `prepare_model_for_kbit_training` arguments from `peft` library.
+        """
+        if kwargs is not None:
+            peft_config = kwargs.pop("peft_config", None)
+            reward_adapter = kwargs.pop("reward_adapter", None)
+            reward_adapter_name = kwargs.pop("reward_adapter_name", "reward_adapter")
+            is_trainable = kwargs.pop("is_trainable", False)
+            trl_model_args, pretrained_kwargs, peft_quantization_kwargs = (
+                cls._split_kwargs(kwargs)
+            )
+            token = pretrained_kwargs.get("token", None)
+        else:
+            peft_config = None
+            is_trainable = False
+            trl_model_args = {}
+            pretrained_kwargs = {}
+            peft_quantization_kwargs = {}
+            token = None
+
+        if reward_adapter is not None and not isinstance(reward_adapter, str):
+            raise ValueError(
+                "The `reward_adapter` argument should be a string representing the name of local path or the Hub id to the Reward Modeling adapter."
+            )
+
+        is_peft_model = False
+
+        current_device = cls._get_current_device()
+        if isinstance(pretrained_model_name_or_path, str):
+            quantization_config = pretrained_kwargs.get("quantization_config")
+            if quantization_config is not None:
+                is_loaded_in_8bit = getattr(quantization_config, "load_in_8bit", False)
+                is_loaded_in_4bit = getattr(quantization_config, "load_in_4bit", False)
+            else:
+                is_loaded_in_8bit = (
+                    pretrained_kwargs["load_in_8bit"]
+                    if "load_in_8bit" in pretrained_kwargs
+                    else False
+                )
+                is_loaded_in_4bit = (
+                    pretrained_kwargs["load_in_4bit"]
+                    if "load_in_4bit" in pretrained_kwargs
+                    else False
+                )
+        else:
+            is_loaded_in_8bit = getattr(
+                pretrained_model_name_or_path, "is_loaded_in_8bit", False
+            )
+            is_loaded_in_4bit = getattr(
+                pretrained_model_name_or_path, "is_loaded_in_4bit", False
+            )
+
+        if (
+            is_loaded_in_8bit or is_loaded_in_4bit
+        ) and "device_map" not in pretrained_kwargs:
+            # warn users
+            logging.warning(
+                "The `device_map` argument is not provided. We will override the device_map argument."
+                " to set the entire"
+                " model on the current device. If you want to set the model on multiple devices, please provide"
+                " a custom `device_map` argument."
+            )
+            pretrained_kwargs["device_map"] = {"": current_device}
+
+        if (
+            is_peft_available()
+            and peft_config is not None
+            and not isinstance(peft_config, PeftConfig)
+        ):
+            raise ValueError(
+                "The `peft_config` argument should be an instance of `peft.PeftConfig` class."
+            )
+
+        # First, load the pre-trained model using the parent-class
+        # either `AutoModelForCausalLM` or `AutoModelForSeq2SeqLM`
+        if isinstance(pretrained_model_name_or_path, str):
+            if is_peft_available():
+                try:
+                    # If there is a trained peft adapter in the hub, load its config.
+                    remote_adapter_config = hf_hub_download(
+                        pretrained_model_name_or_path,
+                        "adapter_config.json",
+                        token=token,
+                    )
+                except (
+                    EntryNotFoundError,
+                    LocalEntryNotFoundError,
+                    HFValidationError,
+                    RepositoryNotFoundError,
+                ):
+                    remote_adapter_config = None
+            else:
+                remote_adapter_config = None
+
+            local_adapter_present = os.path.exists(
+                os.path.join(pretrained_model_name_or_path, "adapter_config.json")
+            )
+
+            if (
+                local_adapter_present or remote_adapter_config is not None
+            ) and is_peft_available():
+                if peft_config is not None:
+                    logging.warning(
+                        "`peft_config` argument ignored since a peft config file was found in "
+                        f"{pretrained_model_name_or_path}"
+                    )
+
+                # Load the trained peft adapter config
+                if local_adapter_present:
+                    trained_adapter_config = PeftConfig.from_pretrained(
+                        pretrained_model_name_or_path
+                    )
+                else:
+                    remote_adapter_dir = os.path.dirname(remote_adapter_config)
+                    trained_adapter_config = PeftConfig.from_pretrained(
+                        remote_adapter_dir
+                    )
+
+                # Load the pretrained base model
+                pretrained_model = cls.transformers_parent_class.from_pretrained(
+                    trained_adapter_config.base_model_name_or_path,
+                    *model_args,
+                    **pretrained_kwargs,
+                )
+
+                # Wrap the pretrained model with the trained peft adapter
+                pretrained_model = PeftModel.from_pretrained(
+                    pretrained_model,
+                    pretrained_model_name_or_path,
+                    is_trainable=is_trainable,
+                    token=token,
+                )
+                logging.info("Trained peft adapter loaded")
+            else:
+                pretrained_model = cls.transformers_parent_class.from_pretrained(
+                    pretrained_model_name_or_path, *model_args, **pretrained_kwargs
+                )
+
+                if peft_config is not None:
+                    # Initialize a new peft adapter with the given config
+                    if is_loaded_in_8bit or is_loaded_in_4bit:
+                        pretrained_model = prepare_model_for_kbit_training(
+                            pretrained_model,
+                            **peft_quantization_kwargs,
+                        )
+                    pretrained_model = get_peft_model(pretrained_model, peft_config)
+                    logging.info("peft adapter initialised")
+
+        elif isinstance(
+            pretrained_model_name_or_path, cls.supported_pretrained_model_architectures
+        ):
+            pretrained_model = pretrained_model_name_or_path
+
+            if peft_config is not None and isinstance(
+                pretrained_model, PreTrainedModel
+            ):
+                # Initialize a new peft adapter with the given config
+                if is_loaded_in_8bit or is_loaded_in_4bit:
+                    pretrained_model = prepare_model_for_kbit_training(
+                        pretrained_model,
+                        **peft_quantization_kwargs,
+                    )
+                pretrained_model = get_peft_model(pretrained_model, peft_config)
+                logging.info("peft adapter initialised")
+        else:
+            raise ValueError(
+                "pretrained_model_name_or_path should be a string or a PreTrainedModel, "
+                f"but is {type(pretrained_model_name_or_path)}"
+            )
+
+        if is_peft_available():
+            if isinstance(pretrained_model, PeftModel):
+                is_peft_model = True
+                # for backward compatibility
+                if hasattr(pretrained_model, "active_peft_config") and isinstance(
+                    pretrained_model.active_peft_config, PromptLearningConfig
+                ):
+                    raise ValueError(
+                        "PromptLearningConfig is not supported for PPO training."
+                    )
+
+        # Add reward modeling adapter if specified
+        if not is_peft_model and reward_adapter is not None:
+            raise ValueError("reward_adapter can only be used with a PeftModel. ")
+        if is_peft_model and reward_adapter is not None:
+            score_module = cls.add_and_load_reward_modeling_adapter(
+                pretrained_model, reward_adapter, reward_adapter_name, token=token
+            )
+            multi_adapter_args = {
+                "score_module": score_module,
+                "supports_rm_adapter": True,
+                "rm_adapter_name": reward_adapter_name,
+            }
+        else:
+            multi_adapter_args = {"supports_rm_adapter": False}
+
+        # Then, create the full model by instantiating the wrapper class
+        model = cls(pretrained_model, **multi_adapter_args, **trl_model_args)
+
+        # if resume_training, load the state_dict again - this is ok since the
+        # state_dict is removed from the model after loading it.
+        is_resuming_training = True
+        if isinstance(pretrained_model_name_or_path, str):
+            safe_filename = os.path.join(
+                pretrained_model_name_or_path, "model.safetensors"
+            )
+            filename = os.path.join(pretrained_model_name_or_path, "pytorch_model.bin")
+
+            sharded_index_filename = os.path.join(
+                pretrained_model_name_or_path, "pytorch_model.bin.index.json"
+            )
+            safe_sharded_index_filename = os.path.join(
+                pretrained_model_name_or_path, "model.safetensors.index.json"
+            )
+            is_sharded = False
+            use_safe = os.path.exists(safe_filename)
+
+            if not (os.path.exists(filename) or os.path.exists(safe_filename)):
+                # Try with `pytorch_model.bin`
+                filename, files_to_download, is_sharded, is_resuming_training = (
+                    cls._get_checkpoint_from_hub(
+                        pretrained_model,
+                        pretrained_model_name_or_path,
+                        sharded_index_filename,
+                        token=token,
+                    )
+                )
+                # Try with safetensors
+                if filename is None and files_to_download is None:
+                    (
+                        safe_filename,
+                        files_to_download,
+                        is_sharded,
+                        is_resuming_training,
+                    ) = cls._get_checkpoint_from_hub(
+                        pretrained_model,
+                        pretrained_model_name_or_path,
+                        safe_sharded_index_filename,
+                        token=token,
+                        model_name="model.safetensors",
+                        model_index_name="model.safetensors.index.json",
+                    )
+                    use_safe = True
+                else:
+                    use_safe = False
+
+            loading_func = safe_load_file if use_safe else torch.load
+            load_kwargs = (
+                {} if use_safe else {"map_location": "cpu", "weights_only": True}
+            )
+
+            if is_resuming_training:
+                if is_sharded:
+                    # download each file and add it to the state_dict
+                    state_dict = {}
+
+                    for shard_file in files_to_download:
+                        filename = hf_hub_download(
+                            pretrained_model_name_or_path,
+                            shard_file,
+                            token=token,
+                        )
+                        state_dict.update(loading_func(filename, **load_kwargs))
+                else:
+                    state_dict = loading_func(
+                        filename if not use_safe else safe_filename, **load_kwargs
+                    )
+
+        else:
+            state_dict = pretrained_model_name_or_path.state_dict()
+
+        model.is_peft_model = is_peft_model
+        model.current_device = current_device
+
+        if is_resuming_training:
+            model.post_init(state_dict=state_dict)
+
+        return model
+
+    @classmethod
+    def _get_checkpoint_from_hub(
+        cls,
+        pretrained_model,
+        pretrained_model_name_or_path,
+        index_filename,
+        token=None,
+        model_name="pytorch_model.bin",
+        model_index_name="pytorch_model.bin.index.json",
+    ):
+        files_to_download = None
+        filename = None
+        is_resuming_training = True
+        is_sharded = False
+
+        try:
+            filename = hf_hub_download(
+                pretrained_model_name_or_path,
+                model_name,
+                token=token,
+            )
+        # sharded
+        except (
+            EntryNotFoundError,
+            LocalEntryNotFoundError,
+            HFValidationError,
+            RepositoryNotFoundError,
+        ):
+            if os.path.exists(index_filename):
+                index_file_name = index_filename
+            else:
+                try:
+                    index_file_name = hf_hub_download(
+                        pretrained_model_name_or_path,
+                        model_index_name,
+                        token=token,
+                    )
+                except (
+                    EntryNotFoundError,
+                    LocalEntryNotFoundError,
+                    HFValidationError,
+                    RepositoryNotFoundError,
+                ):
+                    # not continue training, do not have v_head weight
+                    is_resuming_training = False
+                    logging.warning(
+                        f"A {type(pretrained_model)} model is loaded from '{pretrained_model_name_or_path}', "
+                        f"and no v_head weight is found. This IS expected if you are not resuming PPO training."
+                    )
+            # load json
+            if is_resuming_training:
+                with open(index_file_name) as f:
+                    index = json.load(f)
+                # check filename with `v_head` or any known extra module:
+                files_to_download = set()
+                for k, v in index["weight_map"].items():
+                    if any(module in k for module in cls.supported_modules):
+                        files_to_download.add(v)
+                is_sharded = True
+
+        return filename, files_to_download, is_sharded, is_resuming_training
+
+    @classmethod
+    def _get_current_device(cls):
+        r"""
+        Get the current device. For GPU & XPU, we return the local process index using the `accelerate.PartialState`
+        object to handle corner cases when running scripts in distributed environments.
+
+        Returns
+        -------
+            current_device (`int | str`):
+                The current device.
+        """
+        state = PartialState()
+        if torch.cuda.is_available() or is_torch_xpu_available():
+            return state.local_process_index
+        if is_torch_npu_available():
+            return f"npu:{state.local_process_index}"
+        return "cpu"
+
+    @classmethod
+    def _split_kwargs(cls, kwargs):
+        """
+        Separate the kwargs from the arguments that we support inside `supported_args` and the ones that we don't.
+        """
+        check_peft_kwargs = False
+
+        if is_peft_available():
+            from peft import prepare_model_for_kbit_training
+
+            check_peft_kwargs = True
+
+        supported_kwargs = {}
+        unsupported_kwargs = {}
+        peft_kwargs = {}
+
+        for key, value in kwargs.items():
+            if key in cls.supported_args:
+                supported_kwargs[key] = value
+            else:
+                unsupported_kwargs[key] = value
+
+            if check_peft_kwargs:
+                if key in prepare_model_for_kbit_training.__code__.co_varnames:
+                    peft_kwargs[key] = value
+                    if key in unsupported_kwargs:
+                        unsupported_kwargs.pop(key)
+
+        return supported_kwargs, unsupported_kwargs, peft_kwargs
+
+    @classmethod
+    def add_and_load_reward_modeling_adapter(
+        cls,
+        pretrained_model,
+        adapter_model_id,
+        adapter_name="reward_model_adapter",
+        token=None,
+    ):
+        r"""
+        Add and load a reward modeling adapter. This method can only be used if the model is a `PeftModel` and if you
+        have initialized the model with the `reward_modeling_adapter_id` argument, pointing to the id of the reward
+        modeling adapter. The latest needs also to contain the score head in order to produce the reward.
+        """
+        pretrained_model.load_adapter(
+            adapter_model_id, adapter_name, is_trainable=False
+        )
+        pretrained_model.train()
+
+        filename = os.path.join(adapter_model_id, "adapter_model.bin")
+        safe_loading = False
+        if not os.path.exists(filename):
+            try:
+                local_filename = hf_hub_download(
+                    adapter_model_id,
+                    "adapter_model.bin",
+                    token=token,
+                )
+            except Exception:
+                filename = os.path.join(adapter_model_id, "adapter_model.safetensors")
+                safe_loading = True
+                if not os.path.exists(filename):
+                    try:
+                        local_filename = hf_hub_download(
+                            adapter_model_id,
+                            "adapter_model.safetensors",
+                            token=token,
+                        )
+                    except Exception as exc:
+                        raise ValueError(
+                            "Could not find adapter model in the Hub, make sure you have the correct adapter model id."
+                        ) from exc
+                else:
+                    local_filename = filename
+        else:
+            local_filename = filename
+
+        loading_func = safe_load_file if safe_loading else torch.load
+        load_kwargs = (
+            {} if safe_loading else {"map_location": "cpu", "weights_only": True}
+        )
+
+        adapter_state_dict = loading_func(local_filename, **load_kwargs)
+
+        for score_name_candidate in cls.supported_rm_modules:
+            if any(score_name_candidate in name for name in adapter_state_dict.keys()):
+                score_name = score_name_candidate
+                # we have found the correct head name and can break
+                break
+
+        score_dict = {}
+
+        for name, param in adapter_state_dict.items():
+            if score_name in name:
+                key_name = ".".join(name.split(".")[-1:])
+                score_dict[key_name] = param.to(cls._get_current_device())
+
+        num_labels, hidden_dim = score_dict["weight"].shape
+        has_bias = any("bias" in name for name in adapter_state_dict.keys())
+
+        score = nn.Linear(hidden_dim, num_labels, bias=has_bias).to(
+            device=cls._get_current_device(),
+            dtype=pretrained_model.dtype,
+        )
+        score.load_state_dict(score_dict)
+        for param in score.parameters():
+            param.requires_grad = False
+
+        return score
+
+    def push_to_hub(self, *args, **kwargs):
+        r"""
+        Push the pretrained model to the hub. This method is a wrapper around
+        [`~transformers.PreTrainedModel.push_to_hub`]. Please refer to the documentation of
+        [`~transformers.PreTrainedModel.push_to_hub`] for more information.
+
+        Args:
+            *args (`list`, *optional*):
+                Positional arguments passed along to the underlying model's `push_to_hub` method.
+            **kwargs (`dict`, *optional*):
+                Keyword arguments passed along to the underlying model's `push_to_hub` method.
+        """
+        raise NotImplementedError
+
+    def save_pretrained(self, *args, **kwargs):
+        r"""
+        Save the pretrained model to a directory. This method is a wrapper around
+        [`~transformers.PreTrainedModel.save_pretrained`]. Please refer to the documentation of
+        [`~transformers.PreTrainedModel.save_pretrained`] for more information.
+
+        Args:
+            *args (`list`, *optional*):
+                Positional arguments passed along to the underlying model's `save_pretrained` method.
+            **kwargs (`dict`, *optional*):
+                Keyword arguments passed along to the underlying model's `save_pretrained` method.
+        """
+        state_dict = kwargs.get("state_dict")
+        if state_dict is None:
+            state_dict = self.state_dict()
+            kwargs["state_dict"] = state_dict
+
+        # if it is a peft model only save the `v_head` state_dict and
+        # pop the `state_dict` from the kwargs to avoid silent bugs with `peft`
+        if self.is_peft_model:
+            save_path = args[0]
+            save_path = os.path.join(save_path, "pytorch_model.bin")
+            torch.save(state_dict, save_path)
+            _ = kwargs.pop("state_dict", None)
+
+        return self.pretrained_model.save_pretrained(*args, **kwargs)
+
+    def state_dict(self, *args, **kwargs):
+        r"""
+        Return the state_dict of the pretrained model.
+        """
+        raise NotImplementedError
+
+    def post_init(self, *args, **kwargs):
+        r"""
+        Post initialization method. This method is called after the model is instantiated and loaded from a checkpoint.
+        It can be used to perform additional operations such as loading the state_dict.
+        """
+        raise NotImplementedError
+
+    def compute_reward_score(self, input_ids, attention_mask=None, **kwargs):
+        r"""
+        Computes the reward score for a given input. The method has first to enable the adapter and then compute the
+        reward score. After that the model disables the reward modeling adapter and enables the default ppo adapter
+        again.
+        """
+        if not self.supports_rm_adapter:
+            raise ValueError("This model does not support reward modeling adapter.")
+
+        # enable rm adapter
+        self.pretrained_model.set_adapter(self.rm_adapter_name)
+        self.pretrained_model.eval()
+
+        with torch.no_grad():
+            base_model_output = self.pretrained_model(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                output_hidden_states=True,
+                return_dict=True,
+                **kwargs,
+            )
+
+            last_hidden_states = base_model_output.hidden_states[-1]
+            scores = self.score(last_hidden_states)
+
+        self.pretrained_model.set_adapter(self.policy_adapter_name)
+        self.pretrained_model.eval()
+
+        return scores
+
+
+def create_reference_model(
+    model: PreTrainedModelWrapper,
+    num_shared_layers: int | None = None,
+    pattern: str | None = None,
+) -> PreTrainedModelWrapper:
+    """
+    Creates a static reference copy of a model. Note that model will be in `.eval()` mode.
+
+    Args:
+        model ([`PreTrainedModelWrapper`]): The model to be copied.
+        num_shared_layers (`int`, *optional*):
+            The number of initial layers that are shared between both models and kept frozen.
+        pattern (`str`, *optional*): The shared layers are selected with a string pattern
+            (e.g. "transformer.h.{layer}" for GPT2) and if a custom pattern is necessary it can be passed here.
+
+    Returns
+    -------
+        [`PreTrainedModelWrapper`]
+    """
+    if is_deepspeed_zero3_enabled():
+        raise ValueError(
+            "DeepSpeed ZeRO-3 is enabled and is not compatible with `create_reference_model()`. Please instantiate your reference model directly with `AutoModelForCausalLM.from_pretrained()`."
+        )
+
+    parameter_names = [n for n, _ in model.named_parameters()]
+    ref_model = deepcopy(model)
+
+    # if no layers are shared, return copy of model
+    if num_shared_layers is None:
+        for param_name in parameter_names:
+            param = ref_model.get_parameter(param_name)
+            param.requires_grad = False
+        return ref_model.eval()
+
+    # identify layer name pattern
+    if pattern is not None:
+        pattern = pattern.format(layer=num_shared_layers)
+    else:
+        for pattern_candidate in LAYER_PATTERNS:
+            pattern_candidate = pattern_candidate.format(layer=num_shared_layers)
+            if any(pattern_candidate in name for name in parameter_names):
+                pattern = pattern_candidate
+                break
+
+    if pattern is None:
+        raise ValueError("Layer pattern could not be matched.")
+
+    # divide parameters in shared and unshared parameter lists
+    shared_param_list = []
+    unshared_param_list = []
+
+    shared_parameter = True
+    for name, _param in model.named_parameters():
+        if pattern in name:
+            shared_parameter = False
+        if shared_parameter:
+            shared_param_list.append(name)
+        else:
+            unshared_param_list.append(name)
+
+    # create reference of the original parameter if they are shared
+    for param_name in shared_param_list:
+        param = model.get_parameter(param_name)
+        param.requires_grad = False
+
+        _ref_param = ref_model.get_parameter(param_name)
+
+    # for all other parameters just make sure they don't use gradients
+    for param_name in unshared_param_list:
+        param = ref_model.get_parameter(param_name)
+        param.requires_grad = False
+
+    if pattern is not None and len(unshared_param_list) == 0:
+        logging.warning(
+            "Pattern passed or found, but no layers matched in the model. Check for a typo."
+        )
+
+    return ref_model.eval()
+
+
+class GeometricMixtureWrapper(GenerationMixin):
+    """
+    Geometric Mixture generation wrapper that samples from the logits of two model's geometric mixture.
+
+    Args:
+        model ([`~transformers.PreTrainedModel`]): The model to be wrapped.
+        ref_model ([`~transformers.PreTrainedModel`]): The reference model.
+        generation_config ([`~transformers.GenerationConfig`]): The generation config.
+        mixture_coef (`float`, *optional* - default: 0.5): The mixture coefficient.
+    """
+
+    main_input_name = "input_ids"
+    _supports_cache_class = False
+    _supports_static_cache = False
+    _is_stateful = False
+
+    def __init__(
+        self, model, ref_model, generation_config, mixture_coef=0.5, device=None
+    ):
+        super().__init__()
+
+        self.model = model
+        self.config = model.config
+        self.ref_model = ref_model
+        self.generation_config = generation_config
+        self.mixture_coef = mixture_coef
+        self.device = device
+        if hasattr(self.model, "_is_stateful"):
+            self._is_stateful = self.model._is_stateful
+
+    def __call__(self, *args, **kwargs):
+        return self.forward(*args, **kwargs)
+
+    @torch.inference_mode()
+    def forward(self, *args, **kwargs):
+        model_outputs = self.model(*args, **kwargs)
+        model_logits = model_outputs.logits
+        ref_model_logits = self.ref_model(*args, **kwargs).logits
+
+        model_outputs.logits = torch.nn.functional.log_softmax(
+            self.mixture_coef * ref_model_logits
+            + (1 - self.mixture_coef) * model_logits,
+            dim=-1,
+        )
+
+        return model_outputs
+
+    def prepare_inputs_for_generation(self, *args, **kwargs):
+        # turn off cache in the generation config
+        kwargs["use_cache"] = False
+        model_inputs = self.model.prepare_inputs_for_generation(*args, **kwargs)
+        _ = self.ref_model.prepare_inputs_for_generation(*args, **kwargs)
+
+        return model_inputs
+
+    def _validate_model_class(self):
+        self.model._validate_model_class()
+
+    def _validate_model_kwargs(self, model_kwargs):
+        return self.model._validate_model_kwargs(model_kwargs)
diff --git a/src/aixpert/training/training/trl/models/modeling_value_head.py b/src/aixpert/training/training/trl/models/modeling_value_head.py
new file mode 100644
index 0000000..db6eff5
--- /dev/null
+++ b/src/aixpert/training/training/trl/models/modeling_value_head.py
@@ -0,0 +1,454 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+from torch import nn
+from transformers import (
+    AutoModelForCausalLM,
+    AutoModelForSeq2SeqLM,
+    is_torch_npu_available,
+    is_torch_xpu_available,
+)
+
+from .modeling_base import PreTrainedModelWrapper
+
+
+class ValueHead(nn.Module):
+    r"""
+    The ValueHead class implements a head for GPT2 that returns a scalar for each output token.
+    """
+
+    def __init__(self, config, **kwargs):
+        super().__init__()
+        if not hasattr(config, "summary_dropout_prob"):
+            summary_dropout_prob = kwargs.pop("summary_dropout_prob", 0.1)
+        else:
+            summary_dropout_prob = config.summary_dropout_prob
+
+        self.dropout = (
+            nn.Dropout(summary_dropout_prob) if summary_dropout_prob else nn.Identity()
+        )
+
+        # some models such as OPT have a projection layer before the word embeddings - e.g. OPT-350m
+        if hasattr(config, "hidden_size"):
+            hidden_size = config.hidden_size
+        if hasattr(config, "word_embed_proj_dim"):
+            hidden_size = config.word_embed_proj_dim
+        elif hasattr(config, "is_encoder_decoder"):
+            if config.is_encoder_decoder and hasattr(config, "decoder"):
+                if hasattr(config.decoder, "hidden_size"):
+                    hidden_size = config.decoder.hidden_size
+
+        self.summary = nn.Linear(hidden_size, 1)
+
+        self.flatten = nn.Flatten()
+
+    def forward(self, hidden_states):
+        output = self.dropout(hidden_states)
+
+        # For now force upcast in fp32 if needed. Let's keep the
+        # output in fp32 for numerical stability.
+        if output.dtype != self.summary.weight.dtype:
+            output = output.to(self.summary.weight.dtype)
+
+        output = self.summary(output)
+        return output
+
+
+class AutoModelForCausalLMWithValueHead(PreTrainedModelWrapper):
+    """
+    An autoregressive model with a value head in addition to the language model head. This class inherits from
+    [`PreTrainedModelWrapper`] and wraps a [`~transformers.PreTrainedModel`] class. The wrapper class supports classic
+    functions such as `from_pretrained`, `push_to_hub` and `generate`. To call a method of the wrapped model, simply
+    manipulate the `pretrained_model` attribute of this class.
+
+    Class attributes:
+        - **transformers_parent_class** ([`~transformers.PreTrainedModel`]) -- The parent class of the wrapped model.
+          This
+            should be set to `transformers.AutoModelForCausalLM` for this class.
+        - **supported_args** (`tuple`) -- A tuple of strings that are used to identify the arguments that are supported
+            by the [`ValueHead`] class. Currently, the supported args are:
+            - **summary_dropout_prob** (`float`, `optional`, defaults to `None`) -- The dropout probability for the
+                [`ValueHead`] class.
+            - **v_head_initializer_range** (`float`, `optional`, defaults to `0.2`) -- The initializer range for the
+                [`ValueHead`] if a specific initialization strategy is selected.
+            - **v_head_init_strategy** (`str`, `optional`, defaults to `None`) -- The initialization strategy for the
+                [`ValueHead`]. Currently, the supported strategies are:
+                - **`None`** -- Initializes the weights of the [`ValueHead`] with a random distribution. This is the
+                  default strategy.
+                - **"normal"** -- Initializes the weights of the [`ValueHead`] with a normal distribution.
+    """
+
+    transformers_parent_class = AutoModelForCausalLM
+    supported_args = (
+        "summary_dropout_prob",
+        "v_head_initializer_range",
+        "v_head_init_strategy",
+    )
+
+    def __init__(self, pretrained_model, **kwargs):
+        """
+        Initializes the model.
+
+        Args:
+            pretrained_model ([`~transformers.PreTrainedModel`]):
+                The model to wrap. It should be a causal language model such as GPT2. or any model mapped inside the
+                `AutoModelForCausalLM` class.
+            kwargs (`dict`, `optional`):
+                Additional keyword arguments, that are passed to the [`ValueHead`] class.
+        """
+        super().__init__(pretrained_model, **kwargs)
+        v_head_kwargs, _, _ = self._split_kwargs(kwargs)
+        self.v_head = ValueHead(self.pretrained_model.config, **v_head_kwargs)
+        self._init_weights(**v_head_kwargs)
+
+    def _init_weights(self, **kwargs):
+        r"""
+        Initializes the weights of the value head. The default initialization strategy is random. Users can pass a
+        different initialization strategy by passing the `v_head_init_strategy` argument when calling
+        `.from_pretrained`. Supported strategies are:
+        - `normal`: initializes the weights with a normal distribution.
+
+        Args:
+            **kwargs (`dict`, `optional`):
+                Additional keyword arguments, that are passed to the [`ValueHead`] class. These arguments can contain
+                the `v_head_init_strategy` argument as well as the `v_head_initializer_range` argument.
+        """
+        initializer_range = kwargs.pop("v_head_initializer_range", 0.2)
+        # random init by default
+        init_strategy = kwargs.pop("v_head_init_strategy", None)
+        if init_strategy is None:
+            # do nothing
+            pass
+        elif init_strategy == "normal":
+            self.v_head.summary.weight.data.normal_(mean=0.0, std=initializer_range)
+            self.v_head.summary.bias.data.zero_()
+
+    def forward(
+        self,
+        input_ids=None,
+        past_key_values=None,
+        attention_mask=None,
+        return_past_key_values=False,
+        **kwargs,
+    ):
+        r"""
+        Applies a forward pass to the wrapped model and returns the logits of the value head.
+
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary.
+            past_key_values (`tuple(tuple(torch.FloatTensor))`, `optional`):
+                Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
+                (see `past_key_values` input) to speed up sequential decoding.
+            attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, `optional`):
+                Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+            return_past_key_values (bool): A flag indicating if the computed hidden-states should be returned.
+            kwargs (`dict`, `optional`):
+                Additional keyword arguments, that are passed to the wrapped model.
+        """
+        kwargs["output_hidden_states"] = (
+            True  # this had already been set in the LORA / PEFT examples
+        )
+        kwargs["past_key_values"] = past_key_values
+
+        if (
+            self.is_peft_model
+            and self.pretrained_model.active_peft_config.peft_type == "PREFIX_TUNING"
+        ):
+            kwargs.pop("past_key_values")
+
+        base_model_output = self.pretrained_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            **kwargs,
+        )
+
+        last_hidden_state = base_model_output.hidden_states[-1]
+        lm_logits = base_model_output.logits
+        loss = base_model_output.loss
+
+        if last_hidden_state.device != self.v_head.summary.weight.device:
+            last_hidden_state = last_hidden_state.to(self.v_head.summary.weight.device)
+
+        value = self.v_head(last_hidden_state).squeeze(-1)
+
+        # force upcast in fp32 if logits are in half-precision
+        if lm_logits.dtype != torch.float32:
+            lm_logits = lm_logits.float()
+
+        if return_past_key_values:
+            return (lm_logits, loss, value, base_model_output.past_key_values)
+        return (lm_logits, loss, value)
+
+    def generate(self, *args, **kwargs):
+        r"""
+        A simple wrapper around the `generate` method of the wrapped model. Please refer to the
+        [`generate`](https://huggingface.co/docs/transformers/internal/generation_utils) method of the wrapped model
+        for more information about the supported arguments.
+
+        Args:
+            *args (`list`, *optional*):
+                Positional arguments passed to the `generate` method of the wrapped model.
+            **kwargs (`dict`, *optional*):
+                Keyword arguments passed to the `generate` method of the wrapped model.
+        """
+        return self.pretrained_model.generate(*args, **kwargs)
+
+    def state_dict(self, *args, **kwargs):
+        r"""
+        Returns the state dictionary of the model. We add the state dictionary of the value head to the state
+        dictionary of the wrapped model by prepending the key with `v_head.`.
+        """
+        if not self.is_peft_model:
+            pretrained_model_state_dict = self.pretrained_model.state_dict(
+                *args, **kwargs
+            )
+        else:
+            # if it is a peft model, only save the v_head
+            pretrained_model_state_dict = {}
+
+        v_head_state_dict = self.v_head.state_dict(*args, **kwargs)
+        for k, v in v_head_state_dict.items():
+            pretrained_model_state_dict[f"v_head.{k}"] = v
+        return pretrained_model_state_dict
+
+    def push_to_hub(self, *args, **kwargs):
+        self.pretrained_model.v_head = self.v_head
+
+        return self.pretrained_model.push_to_hub(*args, **kwargs)
+
+    def post_init(self, state_dict):
+        r"""
+        We add the state dictionary of the value head to the state dictionary of the wrapped model by prepending the
+        key with `v_head.`. This function removes the `v_head.` prefix from the keys of the value head state
+        dictionary.
+        """
+        for k in list(state_dict.keys()):
+            if "v_head." in k:
+                state_dict[k.replace("v_head.", "")] = state_dict.pop(k)
+        self.v_head.load_state_dict(state_dict, strict=False)
+        del state_dict
+
+        if hasattr(self.pretrained_model, "hf_device_map"):
+            if (
+                "cpu" in self.pretrained_model.hf_device_map.values()
+                or "disk" in self.pretrained_model.hf_device_map.values()
+            ):
+                raise ValueError(
+                    "The model is offloaded on CPU or disk - CPU & disk offloading is not supported for ValueHead models."
+                )
+
+            first_device = list(set(self.pretrained_model.hf_device_map.values()))[0]
+            if isinstance(first_device, int):
+                if is_torch_npu_available():
+                    first_device = f"npu:{first_device}"
+                elif is_torch_xpu_available():
+                    first_device = f"xpu:{first_device}"
+                else:
+                    first_device = f"cuda:{first_device}"
+            self.v_head = self.v_head.to(first_device)
+
+            def set_device_hook(module, input, outputs):
+                new_output = ()
+                for output in outputs:
+                    if isinstance(output, torch.Tensor):
+                        new_output += (output.to(first_device),)
+                    else:
+                        new_output += (output,)
+                return new_output
+
+            self.register_forward_hook(set_device_hook)
+
+            self.is_sequential_parallel = True
+
+
+class AutoModelForSeq2SeqLMWithValueHead(PreTrainedModelWrapper):
+    """
+    A seq2seq model with a value head in addition to the language model head. This class inherits from
+    [`PreTrainedModelWrapper`] and wraps a [`~transformers.PreTrainedModel`] class. The wrapper class supports classic
+    functions such as `from_pretrained` and `push_to_hub` and also provides some additional functionalities such as
+    `generate`.
+
+    Args:
+        pretrained_model ([`~transformers.PreTrainedModel`]):
+            The model to wrap. It should be a causal language model such as GPT2. or any model mapped inside the
+            [`~transformers.AutoModelForSeq2SeqLM`] class.
+        kwargs:
+            Additional keyword arguments passed along to the [`ValueHead`] class.
+    """
+
+    transformers_parent_class = AutoModelForSeq2SeqLM
+    lm_head_namings = ["lm_head", "embed_out", "output_projection"]
+    supported_args = (
+        "summary_dropout_prob",
+        "v_head_initializer_range",
+        "v_head_init_strategy",
+    )
+
+    def __init__(self, pretrained_model, **kwargs):
+        super().__init__(pretrained_model, **kwargs)
+        v_head_kwargs, _, _ = self._split_kwargs(kwargs)
+        self.is_encoder_decoder = True
+
+        if not self._has_lm_head():
+            raise ValueError(
+                "The model does not have a language model head, please use a model that has one."
+            )
+
+        self.v_head = ValueHead(self.pretrained_model.config, **v_head_kwargs)
+
+        self._init_weights(**v_head_kwargs)
+
+    def _has_lm_head(self):
+        # check module names of all modules inside `pretrained_model` to find the language model head
+        for name, _module in self.pretrained_model.named_modules():
+            if any(attribute in name for attribute in self.lm_head_namings):
+                return True
+        return False
+
+    def post_init(self, state_dict):
+        r"""
+        We add the state dictionary of the value head to the state dictionary of the wrapped model by prepending the
+        key with `v_head.`. This function removes the `v_head.` prefix from the keys of the value head state
+        dictionary.
+        """
+        for k in list(state_dict.keys()):
+            if "v_head." in k:
+                state_dict[k.replace("v_head.", "")] = state_dict.pop(k)
+        self.v_head.load_state_dict(state_dict, strict=False)
+        del state_dict
+
+        if hasattr(self.pretrained_model, "hf_device_map"):
+            if (
+                "cpu" in self.pretrained_model.hf_device_map.values()
+                or "disk" in self.pretrained_model.hf_device_map.values()
+            ):
+                raise ValueError(
+                    "The model is offloaded on CPU or disk - CPU & disk offloading is not supported for ValueHead models."
+                )
+
+            # get the lm_head device
+            for name, module in self.pretrained_model.named_modules():
+                if any(attribute in name for attribute in self.lm_head_namings):
+                    lm_head_device = module.weight.device
+                    break
+
+            # put v_head on the same device as the lm_head to avoid issues
+            self.v_head = self.v_head.to(lm_head_device)
+
+            def set_device_hook(module, input, outputs):
+                r"""
+                A hook that sets the device of the output of the model to the device of the first parameter of the
+                model.
+
+                Args:
+                    module (`nn.Module`):
+                        The module to which the hook is attached.
+                    input (`tuple`):
+                        The input to the module.
+                    outputs (`tuple`):
+                        The output of the module.
+                """
+                new_output = ()
+                for output in outputs:
+                    if isinstance(output, torch.Tensor):
+                        new_output += (output.to(lm_head_device),)
+                    else:
+                        new_output += (output,)
+                return new_output
+
+            self.register_forward_hook(set_device_hook)
+            self.is_sequential_parallel = True
+
+    def state_dict(self, *args, **kwargs):
+        r"""
+        Returns the state dictionary of the model. We add the state dictionary of the value head to the state
+        dictionary of the wrapped model by prepending the key with `v_head.`.
+        """
+        if not self.is_peft_model:
+            pretrained_model_state_dict = self.pretrained_model.state_dict(
+                *args, **kwargs
+            )
+        else:
+            # if it is a peft model, only save the v_head
+            pretrained_model_state_dict = {}
+
+        v_head_state_dict = self.v_head.state_dict(*args, **kwargs)
+        for k, v in v_head_state_dict.items():
+            pretrained_model_state_dict[f"v_head.{k}"] = v
+        return pretrained_model_state_dict
+
+    def push_to_hub(self, *args, **kwargs):
+        self.pretrained_model.v_head = self.v_head
+
+        return self.pretrained_model.push_to_hub(*args, **kwargs)
+
+    def _init_weights(self, **kwargs):
+        r"""
+        We initialize the weights of the value head.
+        """
+        initializer_range = kwargs.pop("v_head_initializer_range", 0.2)
+        # random init by default
+        init_strategy = kwargs.pop("v_head_init_strategy", None)
+        if init_strategy is None:
+            # do nothing
+            pass
+        elif init_strategy == "normal":
+            self.v_head.summary.weight.data.normal_(mean=0.0, std=initializer_range)
+            self.v_head.summary.bias.data.zero_()
+
+    def forward(
+        self,
+        input_ids=None,
+        past_key_values=None,
+        attention_mask=None,
+        return_past_key_values=False,
+        **kwargs,
+    ):
+        kwargs["past_key_values"] = past_key_values
+        if (
+            self.is_peft_model
+            and self.pretrained_model.active_peft_config.peft_type == "PREFIX_TUNING"
+        ):
+            kwargs.pop("past_key_values")
+
+        base_model_output = self.pretrained_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            output_hidden_states=True,  # We force the model to output hidden states
+            **kwargs,
+        )
+
+        last_hidden_state = base_model_output.decoder_hidden_states[-1]
+        lm_logits = base_model_output.logits
+        loss = base_model_output.loss
+
+        value = self.v_head(last_hidden_state).squeeze(-1)
+
+        # force upcast in fp32 if logits are in half-precision
+        if lm_logits.dtype != torch.float32:
+            lm_logits = lm_logits.float()
+
+        if return_past_key_values:
+            return (lm_logits, loss, value, base_model_output.past_key_values)
+        return (lm_logits, loss, value)
+
+    def generate(self, *args, **kwargs):
+        r"""
+        We call `generate` on the wrapped model.
+        """
+        return self.pretrained_model.generate(*args, **kwargs)
diff --git a/src/aixpert/training/training/trl/models/utils.py b/src/aixpert/training/training/trl/models/utils.py
new file mode 100644
index 0000000..5247d31
--- /dev/null
+++ b/src/aixpert/training/training/trl/models/utils.py
@@ -0,0 +1,678 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import itertools
+import warnings
+from collections.abc import Callable
+from contextlib import contextmanager
+from copy import deepcopy
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Literal
+
+import torch
+from accelerate.utils import is_peft_model
+from packaging import version
+from torch import nn
+from transformers import (
+    AddedToken,
+    AutoTokenizer,
+    PreTrainedModel,
+    PreTrainedTokenizer,
+    TrainingArguments,
+)
+from transformers.utils import is_peft_available
+
+from .modeling_value_head import (
+    AutoModelForCausalLMWithValueHead,
+    AutoModelForSeq2SeqLMWithValueHead,
+)
+
+
+if is_peft_available():
+    import peft
+    from peft import PeftConfig, PeftModel, get_peft_model
+
+
+if TYPE_CHECKING:
+    from accelerate import Accelerator
+    from deepspeed.runtime.engine import DeepSpeedEngine
+    from torch.nn import Module
+    from torch.nn.parallel.distributed import DistributedDataParallel
+
+
+SUPPORTED_ARCHITECTURES = (
+    AutoModelForCausalLMWithValueHead,
+    AutoModelForSeq2SeqLMWithValueHead,
+)
+
+
+# TODO: Add Abstract Base Class if more formats are added
+@dataclass
+class ChatMlSpecialTokens:
+    """Dataclass for special tokens used in ChatML, including system, user, assistant, bos, eos, and pad tokens."""
+
+    bos_token: str = "<|im_start|>"
+    eos_token: str = "<|im_end|>"
+    pad_token: str = "<|im_end|>"
+
+    @property
+    def system(self):
+        return f"{self.bos_token}system"
+
+    @property
+    def user(self):
+        return f"{self.bos_token}user"
+
+    @property
+    def assistant(self):
+        return f"{self.bos_token}assistant"
+
+    @property
+    def chat_template(self):
+        return (
+            "{% for message in messages %}"
+            f"{{{{'{self.bos_token}' + message['role'] + '\n' + message['content'] + '{self.eos_token}' + '\n'}}}}"
+            "{% endfor %}"
+            "{% if add_generation_prompt %}"
+            f"{{{{ '{self.assistant}\n' }}}}"
+            "{% endif %}"
+        )
+
+
+FORMAT_MAPPING = {"chatml": ChatMlSpecialTokens}
+
+
+def setup_chat_format(
+    model: PreTrainedModel,
+    tokenizer: PreTrainedTokenizer,
+    format: Literal["chatml"] | None = "chatml",
+    resize_to_multiple_of: int | None = None,
+) -> tuple[PreTrainedModel, PreTrainedTokenizer]:
+    # docstyle-ignore
+    """
+    Setup chat format by adding special tokens to the tokenizer, setting the correct format, and extending the
+    embedding layer of the model based on the new special tokens.
+
+    > [!WARNING]
+    > This function is deprecated and will be removed in version 0.26.0. Please use [`clone_chat_template`] instead.
+
+    If the model already has a chat template, this will throw an error. If you want to overwrite it, please set
+    `tokenizer.chat_template` to `None`.
+
+    Args:
+        model ([`~transformers.PreTrainedModel`]): The model to be modified.
+        tokenizer ([`~transformers.PreTrainedTokenizer`]): The tokenizer to be modified.
+        format (`Literal["chatml"] | None`): The format to be set. Defaults to "chatml".
+        resize_to_multiple_of (`int` or `None`): Number to resize the embedding layer to. Defaults to None.
+
+    Returns
+    -------
+        model ([`~transformers.PreTrainedModel`]):
+            The modified model.
+        tokenizer ([`~transformers.PreTrainedTokenizer`]):
+            The modified tokenizer.
+    """
+    warnings.warn(
+        "The `setup_chat_format` function is deprecated and will be removed in version 0.26.0. Please use "
+        "`clone_chat_template` instead.",
+        FutureWarning,
+    )
+    # check if model already had a chat template
+    if tokenizer.chat_template is not None:
+        raise ValueError(
+            "Chat template is already added to the tokenizer. If you want to overwrite it, please set it to None"
+        )
+
+    # check if format available and retrieve
+    if format not in FORMAT_MAPPING:
+        raise ValueError(
+            f"Format {format} not available. Please use one of {FORMAT_MAPPING.keys()}"
+        )
+
+    chat_format = FORMAT_MAPPING[format]()
+
+    # set special tokens and them
+    tokenizer.eos_token = chat_format.eos_token
+    tokenizer.pad_token = chat_format.pad_token
+    tokenizer.bos_token = chat_format.bos_token
+    tokenizer.add_special_tokens(
+        {"additional_special_tokens": [chat_format.bos_token, chat_format.eos_token]}
+    )
+    # set chat format for tokenizer
+    tokenizer.chat_template = chat_format.chat_template
+
+    # resize embedding layer to a multiple of 64, https://x.com/karpathy/status/1621578354024677377
+    model.resize_token_embeddings(
+        # After studying many tokenizers, we found that len(tokenizer.vocab) is the most reliable way to get the vocab
+        # size. Avoid using tokenizer.vocab_size or tokenizer.vocab_size + len(tokenizer.added_tokens_encoder),
+        # as handling of special and added tokens varies across tokenizers.
+        new_num_tokens=len(tokenizer.vocab),
+        pad_to_multiple_of=resize_to_multiple_of
+        if resize_to_multiple_of is not None
+        else None,
+    )
+    # Update the model config to use the new eos & bos tokens
+    if getattr(model, "config", None) is not None:
+        model.config.pad_token_id = tokenizer.pad_token_id
+        model.config.bos_token_id = tokenizer.bos_token_id
+        model.config.eos_token_id = tokenizer.eos_token_id
+    # Update the generation config to use the new eos & bos token
+    if getattr(model, "generation_config", None) is not None:
+        model.generation_config.bos_token_id = tokenizer.bos_token_id
+        model.generation_config.eos_token_id = tokenizer.eos_token_id
+        model.generation_config.pad_token_id = tokenizer.pad_token_id
+
+    return model, tokenizer
+
+
+def clone_chat_template(
+    model: PreTrainedModel,
+    tokenizer: PreTrainedTokenizer,
+    source_tokenizer_path: str,
+    resize_to_multiple_of: int | None = 64,
+) -> tuple[PreTrainedModel, PreTrainedTokenizer, list[int]]:
+    """
+    Clones a chat template from a source tokenizer to the target tokenizer and updates the model accordingly.
+
+    This function:
+    - Copies the chat template from a source tokenizer to the target tokenizer.
+    - Adds any new tokens from the source tokenizer to the target tokenizer.
+    - Sets and synchronizes the EOS token across the tokenizer and model.
+    - Resizes the model's token embeddings to match the new vocabulary size, optionally rounding it up to a multiple of
+      a specified value. In such cases, dummy tokens are added to the tokenizer to ensure the vocabulary size matches
+      the embedding dimensions.
+
+    Args:
+        model ([`~transformers.PreTrainedModel`]):
+            Model to update.
+        tokenizer ([`~transformers.PreTrainedTokenizer`]):
+            Tokenizer to update.
+        source_tokenizer_path (`str`):
+            Path or identifier of the pretrained tokenizer to clone from.
+        resize_to_multiple_of (`int` or `None`, *optional*, defaults to `64`):
+            The embedding layer will be resized to the new vocabulary size. If this is not `None`, it will round up the
+            new vocabulary size to the nearest multiple of this value.
+
+    Returns
+    -------
+        model ([`~transformers.PreTrainedModel`]):
+            Updated model with resized token embeddings and EOS token configured.
+        tokenizer ([`~transformers.PreTrainedTokenizer`]):
+            Updated tokenizer with the chat template and special tokens applied.
+        added_tokens (`list[int]`):
+            List of tokens that were added to the tokenizer from the source tokenizer.
+
+    Example:
+    ```python
+    from transformers import AutoModelForCausalLM, AutoTokenizer
+    from trl import clone_chat_template
+
+    model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")
+    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
+    model, tokenizer, added_tokens = clone_chat_template(
+        model, tokenizer, "Qwen/Qwen3-0.6B"
+    )
+    ```
+    """
+    # Load the source tokenizer containing the desired chat template
+    tokenizer_source = AutoTokenizer.from_pretrained(source_tokenizer_path)
+
+    # Copy the chat template from the source tokenizer
+    tokenizer.chat_template = tokenizer_source.get_chat_template()
+
+    # Ensure all added tokens from the source are available in the target tokenizer
+    added_tokens = [
+        token
+        for token in tokenizer_source.added_tokens_decoder.values()
+        if token.content not in tokenizer.vocab
+    ]
+    tokenizer.add_tokens(added_tokens)
+
+    # Set the EOS token from the source tokenizer (important for generation)
+    tokenizer.eos_token = tokenizer_source.eos_token
+    model.config.eos_token_id = tokenizer.eos_token_id
+    if (
+        model.generation_config is not None
+    ):  # for SequenceClassification models, generation_config is None
+        model.generation_config.eos_token_id = tokenizer.eos_token_id
+
+    # Resize model embeddings to include any new tokens, optionally rounding up to a multiple
+    model.resize_token_embeddings(
+        # After studying many tokenizers, we found that len(tokenizer.vocab) is the most reliable way to get the vocab
+        # size. Avoid using tokenizer.vocab_size or tokenizer.vocab_size + len(tokenizer.added_tokens_encoder),
+        # as handling of special and added tokens varies across tokenizers.
+        new_num_tokens=len(tokenizer.vocab),
+        pad_to_multiple_of=resize_to_multiple_of
+        if resize_to_multiple_of is not None
+        else None,
+    )
+
+    # After resizing, the embedding matrix size may exceed the vocabulary size. Add dummy tokens to the tokenizer to
+    # ensure vocabulary size matches the embedding matrix dimensions.
+    idx = 0
+    while model.vocab_size > len(tokenizer.vocab):
+        dummy_token = AddedToken(f"<extra_id_{idx}>")
+        is_added = tokenizer.add_tokens(dummy_token)
+        idx += 1
+        if is_added == 1:
+            added_tokens.append(dummy_token)
+
+    # Verify that vocabulary size now matches embedding dimensions
+    if len(tokenizer.vocab) != model.vocab_size:
+        raise RuntimeError(
+            f"Vocabulary size mismatch after resizing: tokenizer vocab size is {len(tokenizer.vocab)}, but model "
+            f"embedding size is {model.vocab_size}. This indicates an internal error in the token alignment process."
+        )
+    added_tokens = [token.content for token in added_tokens]
+    added_tokens = tokenizer.convert_tokens_to_ids(added_tokens)
+    return model, tokenizer, added_tokens
+
+
+def remove_hooks(model: "DeepSpeedEngine") -> None:
+    """Removes the optimizer hooks from a DeepSpeed ZeRO-3 model."""
+    if not hasattr(
+        model, "optimizer"
+    ):  # before the first training step, the model has no optimizer
+        return
+    if model.optimizer is not None and hasattr(model.optimizer, "parameter_offload"):
+        optimizer_offload = model.optimizer.parameter_offload
+    elif model.optimizer is not None:
+        optimizer_offload = model.optimizer
+    else:
+        raise RuntimeError("The model optimizer is None, which is not yet supported.")
+
+    for param in iter_params(optimizer_offload.module, recurse=True):
+        param.ds_active_sub_modules.clear()
+
+    for hook in optimizer_offload.forward_hooks:
+        hook.remove()
+    for hook in optimizer_offload.backward_hooks:
+        hook.remove()
+
+    optimizer_offload.forward_hooks = []
+    optimizer_offload.backward_hooks = []
+
+
+def get_all_parameters(sub_module, recurse=False):
+    return itertools.chain(
+        sub_module.named_parameters(recurse=recurse),
+        sub_module.ds_external_parameters(),
+    )
+
+
+def iter_params(module, recurse=False):
+    return [param for _, param in get_all_parameters(module, recurse)]
+
+
+def add_hooks(model: "DeepSpeedEngine") -> None:
+    """Adds the optimizer hooks from a DeepSpeed ZeRO-3 model."""
+    import deepspeed
+
+    if not hasattr(
+        model, "optimizer"
+    ):  # before the first training step, the model has no optimizer
+        return
+    if model.optimizer is not None and hasattr(model.optimizer, "parameter_offload"):
+        optimizer_offload = model.optimizer.parameter_offload
+    elif model.optimizer is not None:
+        optimizer_offload = model.optimizer
+    else:
+        raise RuntimeError("The model optimizer is None, which is not yet supported.")
+    if version.parse(deepspeed.__version__) >= version.parse("0.16.4"):
+        # Account for renaming in https://github.com/deepspeedai/DeepSpeed/pull/6847
+        optimizer_offload._register_deepspeed_module(optimizer_offload.module)
+    else:
+        optimizer_offload._register_hooks_recursively(optimizer_offload.module)
+
+
+@contextmanager
+def unwrap_model_for_generation(
+    model: "DistributedDataParallel | DeepSpeedEngine",
+    accelerator: "Accelerator",
+    gather_deepspeed3_params: bool = True,
+):
+    """
+    Context manager to unwrap distributed or accelerated models for generation tasks.
+
+    Args:
+        model (`DistributedDataParallel | DeepSpeedEngine`):
+            Model to be unwrapped.
+        accelerator ([`~accelerate.Accelerator`]):
+            Accelerator instance managing the model.
+        gather_deepspeed3_params (`bool`, *optional*, defaults to `True`):
+            Whether to gather weights for DeepSpeed ZeRO Stage 3 models. If `False`, skips parameter gathering, which
+            can be more memory-efficient but may lead to slower generation times.
+
+    Yields
+    ------
+        Unwrapped model.
+
+    Example:
+    ```python
+    with unwrap_model_for_generation(model, accelerator) as unwrapped_model:
+        generated_outputs = unwrapped_model.generate(input_ids)
+    ```
+    """
+    unwrapped_model = accelerator.unwrap_model(model)
+    is_gradient_checkpointing = unwrapped_model.is_gradient_checkpointing
+    if is_gradient_checkpointing:
+        unwrapped_model.gradient_checkpointing_disable()
+    if (
+        accelerator.state.deepspeed_plugin is not None
+        and accelerator.state.deepspeed_plugin.zero_stage == 3
+    ):
+        if not gather_deepspeed3_params:
+            yield accelerator.unwrap_model(model)
+        else:
+            import deepspeed
+
+            with deepspeed.zero.GatheredParameters(model.parameters()):
+                remove_hooks(model)
+                yield accelerator.unwrap_model(model)
+                add_hooks(model)
+    else:
+        yield unwrapped_model
+    if is_gradient_checkpointing:
+        unwrapped_model.gradient_checkpointing_enable()
+
+
+def prepare_deepspeed(model: "Module", accelerator: "Accelerator"):
+    """Prepares the model for DeepSpeed inference or evaluation by initializing it with the appropriate configuration.
+
+    Adapted from accelerate:
+    https://github.com/huggingface/accelerate/blob/739b135f8367becb67ffaada12fe76e3aa60fefd/src/accelerate/accelerator.py#L1473
+    """
+    import deepspeed  # local import (instead of top-level) to avoid DS init interfering with other backends (like vllm): https://github.com/deepspeedai/DeepSpeed/issues/7252
+
+    deepspeed_plugin = accelerator.state.deepspeed_plugin
+    config_kwargs = deepcopy(deepspeed_plugin.deepspeed_config)
+    stage = config_kwargs["zero_optimization"]["stage"]
+
+    if model is not None:
+        hidden_size = (
+            max(model.config.hidden_sizes)
+            if getattr(model.config, "hidden_sizes", None)
+            else getattr(model.config, "hidden_size", None)
+        )
+        if hidden_size is not None and stage == 3:
+            # Note that `stage3_prefetch_bucket_size` can produce DeepSpeed messages like: `Invalidate trace cache
+            # @ step 0: expected module 1, but got module 0`
+            # This is expected and is not an error, see: https://github.com/microsoft/DeepSpeed/discussions/4081
+            config_kwargs.update(
+                {
+                    "zero_optimization.reduce_bucket_size": hidden_size * hidden_size,
+                    "zero_optimization.stage3_param_persistence_threshold": 10
+                    * hidden_size,
+                    "zero_optimization.stage3_prefetch_bucket_size": 0.9
+                    * hidden_size
+                    * hidden_size,
+                }
+            )
+
+    # If ZeRO-3 is used, we shard both the active and reference model.
+    # Otherwise, we assume the reference model fits in memory and is initialized on each device with ZeRO
+    # disabled (stage 0)
+    if stage != 3:
+        config_kwargs["zero_optimization"]["stage"] = 0
+    model, *_ = deepspeed.initialize(model=model, config=config_kwargs)
+    model.eval()
+    return model
+
+
+def prepare_fsdp(model, accelerator):
+    # Adapted from accelerate: https://github.com/huggingface/accelerate/blob/739b135f8367becb67ffaada12fe76e3aa60fefd/src/accelerate/accelerator.py#L1421
+    from torch.distributed.fsdp import FSDPModule
+    from torch.distributed.fsdp.fully_sharded_data_parallel import (
+        FullyShardedDataParallel as FSDP,
+    )
+
+    # Check if the model is already a FSDP model due to `Manual Wrapping` and if so,
+    # don't wrap it again
+    if not (isinstance(model, FSDP) or isinstance(model, FSDPModule)):
+        accelerator.state.fsdp_plugin.set_auto_wrap_policy(model)
+        fsdp_plugin = accelerator.state.fsdp_plugin
+        kwargs = {
+            "sharding_strategy": fsdp_plugin.sharding_strategy
+            or fsdp_plugin.reshard_after_forward,
+            "cpu_offload": fsdp_plugin.cpu_offload,
+            "auto_wrap_policy": fsdp_plugin.auto_wrap_policy,
+            "mixed_precision": fsdp_plugin.mixed_precision_policy,
+            "sync_module_states": fsdp_plugin.sync_module_states,
+            "backward_prefetch": fsdp_plugin.backward_prefetch,
+            "forward_prefetch": fsdp_plugin.forward_prefetch,
+            "use_orig_params": fsdp_plugin.use_orig_params,
+            "param_init_fn": fsdp_plugin.param_init_fn,
+            "ignored_modules": fsdp_plugin.ignored_modules,
+            "limit_all_gathers": fsdp_plugin.limit_all_gathers,
+            "device_id": accelerator.device,
+        }
+        model = FSDP(model, **kwargs)
+    model.eval()
+    return model
+
+
+class _ForwardRedirection:
+    """Implements the `forward-redirection`.
+
+    Taken from Pytorch-lightning:
+    https://github.com/Lightning-AI/pytorch-lightning/blob/02311d03fb982560246eead7c08104481fac9579/src/lightning/pytorch/strategies/strategy.py#L602
+
+    A method call to a wrapped module gets rerouted through the wrapper's `forward` method instead.
+
+    """
+
+    def __call__(
+        self,
+        wrapper_module: nn.Module,
+        original_module: nn.Module,
+        method: Callable,
+        *args: Any,
+        **kwargs: Any,
+    ):
+        """Reroutes a method call through the `wrapper_module`'s `forward` method.
+
+        Args:
+            wrapper_module: The module that has `original_module` wrapped.
+            original_module: The module that was wrapped inside `wrapper_module`.
+            method: The method that should be called on the `original_module` after inputs get
+                redirected through the `wrapper_module`'s `forward` method.
+            *args: The positional arguments to the `method`. They will get passed to a patched
+                `forward` method instead.
+            **kwargs: The keyword arguments to the `method`. They will get passed to a patched
+                `forward` method instead.
+
+        """
+        original_forward = original_module.forward
+
+        def wrapped_forward(*_args: Any, **_kwargs: Any) -> Any:
+            # Unpatch ourselves immediately before calling the method `method_name`
+            # because itself may want to call the real `forward`
+            original_module.forward = original_forward  # type: ignore[method-assign]
+            # Call the actual method e.g. `.training_step(...)`
+            out = method(*_args, **_kwargs)
+            self.on_after_inner_forward(wrapper_module, original_module)
+            return out
+
+        # Patch the original_module's forward so we can redirect the arguments back to the real method
+        original_module.forward = wrapped_forward  # type: ignore[method-assign]
+
+        wrapper_output = wrapper_module(*args, **kwargs)
+        self.on_after_outer_forward(wrapper_module, original_module)
+        return wrapper_output
+
+    def on_after_inner_forward(
+        self, wrapper_module: nn.Module, original_module: nn.Module
+    ) -> None:
+        pass
+
+    def on_after_outer_forward(
+        self, wrapper_module: nn.Module, original_module: nn.Module
+    ) -> None:
+        pass
+
+
+def prepare_model_for_kbit_training(
+    model, use_gradient_checkpointing=True, gradient_checkpointing_kwargs=None
+):
+    r"""
+    Prepare a k-bit quantized transformers model for training (PEFT/QLoRA).
+    """
+    loaded_in_kbit = getattr(model, "is_loaded_in_8bit", False) or getattr(
+        model, "is_loaded_in_4bit", False
+    )
+    quant_methods = ["gptq", "aqlm", "eetq", "torchao", "hqq"]
+    is_quantized = getattr(
+        model, "quantization_method", None
+    ) in quant_methods or getattr(model, "hqq_quantized", False)
+
+    if gradient_checkpointing_kwargs is None:
+        gradient_checkpointing_kwargs = {}
+
+    n_upcasted = 0
+    for name, param in model.named_parameters():
+        # freeze all parameters
+        param.requires_grad = False
+
+        # upcast LayerNorm / Norm to float32 for numerical stability
+        if (param.dtype in [torch.float16, torch.bfloat16]) and (
+            "norm" in name.lower() or "layernorm" in name.lower()
+        ):
+            param.data = param.data.to(torch.float32)
+            n_upcasted += 1
+
+    # Enable gradient checkpointing if needed
+    if (loaded_in_kbit or is_quantized) and use_gradient_checkpointing:
+        if hasattr(model, "enable_input_require_grads"):
+            model.enable_input_require_grads()
+        else:
+            # backward-compatible hook
+            def make_inputs_require_grad(module, input, output):
+                output.requires_grad_(True)
+
+            model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
+
+        supports_gc_kwargs = "gradient_checkpointing_kwargs" in list(
+            inspect.signature(model.gradient_checkpointing_enable).parameters
+        )
+        gc_kwargs = (
+            {"gradient_checkpointing_kwargs": gradient_checkpointing_kwargs}
+            if supports_gc_kwargs
+            else {}
+        )
+        model.gradient_checkpointing_enable(**gc_kwargs)
+
+    return model
+
+
+def enable_gradient_checkpointing(
+    model: PreTrainedModel, gradient_checkpointing_kwargs: dict | None
+) -> PreTrainedModel:
+    """Enables gradient checkpointing for the model."""
+    # Enable gradient checkpointing on the base model for PEFT
+    if is_peft_model(model):
+        model.base_model.gradient_checkpointing_enable()
+    # Enable gradient checkpointing for non-PEFT models
+    else:
+        model.gradient_checkpointing_enable()
+
+    gradient_checkpointing_kwargs = gradient_checkpointing_kwargs or {}
+    use_reentrant = (
+        "use_reentrant" not in gradient_checkpointing_kwargs
+        or gradient_checkpointing_kwargs["use_reentrant"]
+    )
+
+    if use_reentrant:
+        if hasattr(model, "enable_input_require_grads"):
+            model.enable_input_require_grads()
+        else:
+
+            def make_inputs_require_grad(module, input, output):
+                output.requires_grad_(True)
+
+            model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
+
+    return model
+
+
+def peft_module_casting_to_bf16(model):
+    for name, module in model.named_modules():
+        if isinstance(module, torch.nn.LayerNorm) or "norm" in name:
+            module = module.to(torch.float32)
+        elif any(x in name for x in ["lm_head", "embed_tokens", "wte", "wpe"]):
+            if hasattr(module, "weight"):
+                if module.weight.dtype == torch.float32:
+                    module = module.to(torch.bfloat16)
+
+
+def prepare_peft_model(
+    model: PreTrainedModel, peft_config: "PeftConfig | None", args: TrainingArguments
+) -> PreTrainedModel:
+    """Prepares a model for PEFT training."""
+    if not is_peft_available():
+        raise ImportError(
+            "PEFT is required to use a peft model. Run `pip install peft`."
+        )
+
+    # If the model is already a PeftModel, we need to merge and unload it.
+    # Further information here: https://huggingface.co/docs/trl/dpo_trainer#reference-model-considerations-with-peft
+    if isinstance(model, PeftModel) and peft_config is not None:
+        model = model.merge_and_unload()
+
+    # Handle quantized models (QLoRA)
+    is_qlora = getattr(model, "is_loaded_in_4bit", False) or getattr(
+        model, "is_loaded_in_8bit", False
+    )
+
+    is_sharded_qlora = False
+    if getattr(model, "is_loaded_in_4bit", False):
+        # Check if model is sharded (FSDP/DS-Zero3)
+        for _, param in model.named_parameters():
+            if param.__class__.__name__ == "Params4bit":
+                is_sharded_qlora = param.data.device.type in {"cpu", "meta"}
+                break
+
+    # Prepare model for kbit training if needed
+    if is_qlora and not is_sharded_qlora and not isinstance(model, PeftModel):
+        model = prepare_model_for_kbit_training(
+            model,
+            use_gradient_checkpointing=args.gradient_checkpointing,
+            gradient_checkpointing_kwargs=args.gradient_checkpointing_kwargs or {},
+        )
+        # Disable gradient checkpointing as it's handled by prepare_model_for_kbit_training
+        args.gradient_checkpointing = False
+    elif args.gradient_checkpointing:
+        model = enable_gradient_checkpointing(model, args.gradient_checkpointing_kwargs)
+
+    # Create PEFT model
+    if peft_config is not None:
+        if (
+            version.parse(peft.__version__)
+            >= version.parse("0.12")  # autocast_adapter_dtype introduced in 0.12
+            and getattr(model, "is_loaded_in_4bit", False)
+            and is_sharded_qlora
+        ):
+            model = get_peft_model(model, peft_config, autocast_adapter_dtype=False)
+        else:
+            model = get_peft_model(model, peft_config)
+
+    # Handle bf16 casting for 4-bit models
+    if (
+        args.bf16
+        and getattr(model, "is_loaded_in_4bit", False)
+        and not is_sharded_qlora
+    ):
+        peft_module_casting_to_bf16(model)
+
+    return model
diff --git a/src/aixpert/training/training/trl/py.typed b/src/aixpert/training/training/trl/py.typed
new file mode 100644
index 0000000..e69de29
diff --git a/src/aixpert/training/training/trl/rewards/__init__.py b/src/aixpert/training/training/trl/rewards/__init__.py
new file mode 100644
index 0000000..0b5666b
--- /dev/null
+++ b/src/aixpert/training/training/trl/rewards/__init__.py
@@ -0,0 +1,37 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+from typing import TYPE_CHECKING
+
+from ..import_utils import _LazyModule
+
+
+_import_structure = {
+    "accuracy_rewards": ["accuracy_reward"],
+    "format_rewards": ["think_format_reward"],
+    "other_rewards": ["get_soft_overlong_punishment"],
+}
+
+
+if TYPE_CHECKING:
+    from .accuracy_rewards import accuracy_reward
+    from .format_rewards import think_format_reward
+    from .other_rewards import get_soft_overlong_punishment
+
+
+else:
+    sys.modules[__name__] = _LazyModule(
+        __name__, __file__, _import_structure, module_spec=__spec__
+    )
diff --git a/src/aixpert/training/training/trl/rewards/accuracy_rewards.py b/src/aixpert/training/training/trl/rewards/accuracy_rewards.py
new file mode 100644
index 0000000..9be6860
--- /dev/null
+++ b/src/aixpert/training/training/trl/rewards/accuracy_rewards.py
@@ -0,0 +1,95 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ..import_utils import is_math_verify_available
+
+
+if is_math_verify_available():
+    from latex2sympy2_extended import NormalizationConfig
+    from math_verify import LatexExtractionConfig, parse, verify
+
+
+def accuracy_reward(
+    completions: list[list[dict[str, str]]], solution: list[str], **kwargs
+) -> list[float | None]:
+    r"""
+    Reward function that checks if the completion is the same as the ground truth.
+        - If both gold and prediction are parseable → use math verification.
+        - If not parseable → compare as normalized text.
+
+    Args:
+        completions (`list[list[dict[str, str]]]`):
+            List of completions to be evaluated. Each completion must be a list of one message, i.e. a dictionary
+            containing the key `"content"` with the value being the text of the completion.
+        solution: (`list[str]`):
+            List of the raw-text solutions to the questions/problems/prompts.
+        **kwargs:
+            Additional keyword arguments. This function does not use them, but they are required in the function
+            signature to ensure compatibility with trainers like [`GRPOTrainer`].
+    Example:
+    ```python
+    >>> from trl.rewards import accuracy_reward
+
+    >>> solution = [r"\frac{1}{3}", r"\frac{1}{3}"]
+    >>> completion = [
+    ...     [{"role": "assistant", "content": r"My answer is \boxed{\frac{1}{3}}"}],
+    ...     [{"role": "assistant", "content": r"My answer is \boxed{\frac{1}{2}}"}],
+    ... ]
+    >>> accuracy_reward(completion, solution)
+    [1.0, 0.0]
+    ```
+    """
+    if not is_math_verify_available():
+        raise ImportError(
+            "Please install the `math_verify` package to use accuracy_reward"
+        )
+
+    contents = [completion[0]["content"] for completion in completions]
+    rewards = []
+    for content, sol in zip(contents, solution, strict=True):
+        gold_parsed = parse(
+            sol,
+            extraction_mode="first_match",
+        )
+        if len(gold_parsed) != 0:
+            # We require the answer to be provided in correct latex (no malformed operators)
+            answer_parsed = parse(
+                content,
+                extraction_config=[
+                    LatexExtractionConfig(
+                        normalization_config=NormalizationConfig(
+                            nits=False,
+                            malformed_operators=False,
+                            basic_latex=True,
+                            boxed="all",
+                            units=True,
+                        ),
+                        # Ensures that boxed is tried first
+                        boxed_match_priority=0,
+                        try_extract_without_anchor=False,
+                    )
+                ],
+                extraction_mode="first_match",
+            )
+            # Compute binary rewards if verifiable, `None` otherwise to skip this example
+            try:
+                reward = float(verify(gold_parsed, answer_parsed))
+            except Exception:
+                reward = None
+        else:
+            # If the gold solution is not parseable, we assign `None` to skip this example
+            reward = float(content.strip().lower() == sol.strip().lower())
+        rewards.append(reward)
+
+    return rewards
diff --git a/src/aixpert/training/training/trl/rewards/format_rewards.py b/src/aixpert/training/training/trl/rewards/format_rewards.py
new file mode 100644
index 0000000..8ce25bd
--- /dev/null
+++ b/src/aixpert/training/training/trl/rewards/format_rewards.py
@@ -0,0 +1,56 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+
+
+def think_format_reward(
+    completions: list[list[dict[str, str]]], **kwargs
+) -> list[float]:
+    r"""
+    Reward function that checks if the reasoning process is enclosed within `"<think>"` and `"</think>"` tags. The
+    function returns a reward of 1.0 if the format is correct, otherwise 0.0.
+
+    Args:
+        completions (`list[list[dict[str, str]]]`):
+            List of completions to be evaluated. Each completion must be a list of one message, i.e. a dictionary
+            containing the key `"content"` with the value being the text of the completion.
+        **kwargs:
+            Additional keyword arguments. This function does not use them, but they are required in the function
+            signature to ensure compatibility with trainers like [`GRPOTrainer`].
+
+    Returns
+    -------
+        `list[float]`:
+            A list of rewards, where each reward is 1.0 if the completion matches the expected format, otherwise 0.0.
+
+    Example:
+    ```python
+    >>> from trl.rewards import think_format_reward
+
+    >>> completions = [
+    ...     [{"content": "<think>\nThis is my reasoning.\n</think>\nThis is my answer."}],
+    ...     [{"content": "<think>\nThis is my reasoning.\nThis is my answer."}],
+    ... ]
+    >>> think_format_reward(completions)
+    [1.0, 0.0]
+    ```
+    """
+    pattern = r"^<think>(?!.*<think>)(.*?)</think>.*$"
+    completion_contents = [completion[0]["content"] for completion in completions]
+    matches = [
+        re.match(pattern, content, re.DOTALL | re.MULTILINE)
+        for content in completion_contents
+    ]
+    return [1.0 if match else 0.0 for match in matches]
diff --git a/src/aixpert/training/training/trl/rewards/other_rewards.py b/src/aixpert/training/training/trl/rewards/other_rewards.py
new file mode 100644
index 0000000..fd9b6c5
--- /dev/null
+++ b/src/aixpert/training/training/trl/rewards/other_rewards.py
@@ -0,0 +1,73 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections.abc import Callable
+
+
+def get_soft_overlong_punishment(
+    max_completion_len: int, soft_punish_cache: int
+) -> Callable:
+    # docstyle-ignore
+    r"""
+    Reward function that penalizes overlong completions. It is used to penalize overlong completions, but not to reward
+    shorter completions. Reference: Eq. (13) from the DAPO paper (https://huggingface.co/papers/2503.14476)
+
+    $$
+    R_{\text{length}}(y) = \begin{cases}
+    0, & |y| \le L_{\max} - L_{\text{cache}} \\
+    \dfrac{(L_{\max} - L_{\text{cache}}) - |y|}{L_{\text{cache}}}, & L_{\max} - L_{\text{cache}} < |y| \le L_{\max} \\
+    -1, & L_{\max} < |y|
+    \end{cases}
+    $$
+
+    Args:
+        max_completion_len (`int`):
+            Maximum length of the completion,  \( L_{\max} \).
+        soft_punish_cache (`int`):
+            Minimum length of the completion,  \( L_{\text{cache}} \). If set to `0`, no minimum length is applied.
+
+    Example:
+    ```python
+    from trl.rewards import get_soft_overlong_punishment
+
+    soft_overlong_punishment = get_soft_overlong_punishment(max_completion_len=100, soft_punish_cache=20)
+    completion_ids = [[1] * 90]  # simulating a completion with 90 tokens. 90 is between 80 and 100.
+    rewards = soft_overlong_punishment(completion_ids)
+    print(rewards)  # [-0.5]
+    ```
+    """
+
+    def soft_overlong_punishment_reward(
+        completion_ids: list[list[int]], **kwargs
+    ) -> list[float]:
+        """Reward function that penalizes overlong completions."""
+        rewards = []
+        for ids in completion_ids:
+            completion_length = len(ids)
+            if completion_length <= max_completion_len - soft_punish_cache:
+                rewards.append(0.0)
+            elif (
+                max_completion_len - soft_punish_cache
+                < completion_length
+                <= max_completion_len
+            ):
+                rewards.append(
+                    (max_completion_len - soft_punish_cache - completion_length)
+                    / soft_punish_cache
+                )
+            else:
+                rewards.append(-1.0)
+        return rewards
+
+    return soft_overlong_punishment_reward
diff --git a/src/aixpert/training/training/trl/scripts/__init__.py b/src/aixpert/training/training/trl/scripts/__init__.py
new file mode 100644
index 0000000..30bc3e5
--- /dev/null
+++ b/src/aixpert/training/training/trl/scripts/__init__.py
@@ -0,0 +1,43 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING
+
+from ..import_utils import _LazyModule
+
+
+_import_structure = {
+    "utils": [
+        "DatasetMixtureConfig",
+        "ScriptArguments",
+        "TrlParser",
+        "get_dataset",
+        "init_zero_verbose",
+    ],
+}
+
+if TYPE_CHECKING:
+    from .utils import (
+        DatasetMixtureConfig,
+        ScriptArguments,
+        TrlParser,
+        get_dataset,
+        init_zero_verbose,
+    )
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__, globals()["__file__"], _import_structure, module_spec=__spec__
+    )
diff --git a/src/aixpert/training/training/trl/scripts/dpo.py b/src/aixpert/training/training/trl/scripts/dpo.py
new file mode 100644
index 0000000..eede371
--- /dev/null
+++ b/src/aixpert/training/training/trl/scripts/dpo.py
@@ -0,0 +1,202 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# /// script
+# dependencies = [
+#     "trl",
+#     "peft",
+#     "trackio",
+#     "kernels",
+# ]
+# ///
+
+"""
+# Full training
+```bash
+python trl/scripts/dpo.py \
+    --dataset_name trl-lib/ultrafeedback_binarized \
+    --model_name_or_path Qwen/Qwen2-0.5B-Instruct \
+    --learning_rate 5.0e-7 \
+    --num_train_epochs 1 \
+    --per_device_train_batch_size 2 \
+    --max_steps 1000 \
+    --gradient_accumulation_steps 8 \
+    --gradient_checkpointing \
+    --eval_strategy steps \
+    --eval_steps 50 \
+    --output_dir Qwen2-0.5B-DPO \
+    --no_remove_unused_columns
+```
+
+# LoRA:
+```bash
+python trl/scripts/dpo.py \
+    --dataset_name trl-lib/ultrafeedback_binarized \
+    --model_name_or_path Qwen/Qwen2-0.5B-Instruct \
+    --learning_rate 5.0e-6 \
+    --num_train_epochs 1 \
+    --per_device_train_batch_size 2 \
+    --max_steps 1000 \
+    --gradient_accumulation_steps 8 \
+    --gradient_checkpointing \
+    --eval_strategy steps \
+    --eval_steps 50 \
+    --output_dir Qwen2-0.5B-DPO \
+    --no_remove_unused_columns \
+    --use_peft \
+    --lora_r 32 \
+    --lora_alpha 16
+```
+"""
+
+import argparse
+import os
+
+import torch
+from accelerate import logging
+from datasets import load_dataset
+from transformers import AutoModelForCausalLM
+
+from trl import (
+    DatasetMixtureConfig,
+    DPOConfig,
+    DPOTrainer,
+    ModelConfig,
+    ScriptArguments,
+    TrlParser,
+    get_dataset,
+    get_kbit_device_map,
+    get_peft_config,
+    get_quantization_config,
+)
+
+
+logger = logging.get_logger(__name__)
+
+# Enable logging in a Hugging Face Space
+os.environ.setdefault("TRACKIO_SPACE_ID", "trl-trackio")
+
+
+def main(script_args, training_args, model_args, dataset_args):
+    ################
+    # Model
+    ###################
+    dtype = (
+        model_args.dtype
+        if model_args.dtype in ["auto", None]
+        else getattr(torch, model_args.dtype)
+    )
+    model_kwargs = dict(
+        revision=model_args.model_revision,
+        attn_implementation=model_args.attn_implementation,
+        dtype=dtype,
+    )
+    quantization_config = get_quantization_config(model_args)
+    if quantization_config is not None:
+        # Passing None would not be treated the same as omitting the argument, so we include it only when valid.
+        model_kwargs["device_map"] = get_kbit_device_map()
+        model_kwargs["quantization_config"] = quantization_config
+
+    model = AutoModelForCausalLM.from_pretrained(
+        model_args.model_name_or_path,
+        trust_remote_code=model_args.trust_remote_code,
+        **model_kwargs,
+    )
+    peft_config = get_peft_config(model_args)
+    if peft_config is None:
+        ref_model = AutoModelForCausalLM.from_pretrained(
+            model_args.model_name_or_path,
+            trust_remote_code=model_args.trust_remote_code,
+            **model_kwargs,
+        )
+    else:
+        ref_model = None
+    if script_args.ignore_bias_buffers:
+        # torch distributed hack
+        model._ddp_params_and_buffers_to_ignore = [
+            name for name, buffer in model.named_buffers() if buffer.dtype == torch.bool
+        ]
+
+    # Load the dataset
+    if dataset_args.datasets and script_args.dataset_name:
+        logger.warning(
+            "Both `datasets` and `dataset_name` are provided. The `datasets` argument will be used to load the "
+            "dataset and `dataset_name` will be ignored."
+        )
+        dataset = get_dataset(dataset_args)
+    elif dataset_args.datasets and not script_args.dataset_name:
+        dataset = get_dataset(dataset_args)
+    elif not dataset_args.datasets and script_args.dataset_name:
+        dataset = load_dataset(
+            script_args.dataset_name,
+            name=script_args.dataset_config,
+            streaming=script_args.dataset_streaming,
+        )
+    else:
+        raise ValueError("Either `datasets` or `dataset_name` must be provided.")
+
+    # Initialize the DPO trainer
+    trainer = DPOTrainer(
+        model,
+        ref_model,
+        args=training_args,
+        train_dataset=dataset[script_args.dataset_train_split],
+        eval_dataset=dataset[script_args.dataset_test_split]
+        if training_args.eval_strategy != "no"
+        else None,
+        peft_config=peft_config,
+    )
+
+    # Train the model
+    trainer.train()
+
+    # Log training complete
+    trainer.accelerator.print("✅ Training completed.")
+
+    if training_args.eval_strategy != "no":
+        metrics = trainer.evaluate()
+        trainer.log_metrics("eval", metrics)
+        trainer.save_metrics("eval", metrics)
+
+    # Save and push to Hub
+    trainer.save_model(training_args.output_dir)
+    trainer.accelerator.print(f"💾 Model saved to {training_args.output_dir}.")
+
+    if training_args.push_to_hub:
+        trainer.push_to_hub(dataset_name=script_args.dataset_name)
+        trainer.accelerator.print(
+            f"🤗 Model pushed to the Hub in https://huggingface.co/{trainer.hub_model_id}."
+        )
+
+
+def make_parser(subparsers: argparse._SubParsersAction | None = None):
+    dataclass_types = (ScriptArguments, DPOConfig, ModelConfig, DatasetMixtureConfig)
+    if subparsers is not None:
+        parser = subparsers.add_parser(
+            "dpo", help="Run the DPO training script", dataclass_types=dataclass_types
+        )
+    else:
+        parser = TrlParser(dataclass_types)
+    return parser
+
+
+if __name__ == "__main__":
+    parser = make_parser()
+    # When using the trl cli, this script may be run with additional arguments, corresponding accelerate arguments.
+    # To ensure that their parsing does not interfere with the script arguments, parse the arguments with
+    # `return_remaining_strings=True`, then ignore the remaining strings.
+    script_args, training_args, model_args, dataset_args, _ = (
+        parser.parse_args_and_config(return_remaining_strings=True)
+    )
+    main(script_args, training_args, model_args, dataset_args)
diff --git a/src/aixpert/training/training/trl/scripts/env.py b/src/aixpert/training/training/trl/scripts/env.py
new file mode 100644
index 0000000..4e8af14
--- /dev/null
+++ b/src/aixpert/training/training/trl/scripts/env.py
@@ -0,0 +1,107 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# /// script
+# dependencies = [
+#     "trl",
+# ]
+# ///
+
+import os
+import platform
+from importlib.metadata import version
+
+import torch
+from accelerate.commands.config import default_config_file, load_config_from_file
+from transformers import is_bitsandbytes_available
+from transformers.utils import is_openai_available, is_peft_available
+
+from trl import __version__
+from trl.import_utils import (
+    is_deepspeed_available,
+    is_liger_kernel_available,
+    is_llm_blender_available,
+    is_vllm_available,
+)
+from trl.scripts.utils import get_git_commit_hash
+
+
+def print_env():
+    devices = None
+    if torch.cuda.is_available():
+        devices = [
+            torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())
+        ]
+    elif torch.backends.mps.is_available():
+        devices = ["MPS"]
+    elif torch.xpu.is_available():
+        devices = [
+            torch.xpu.get_device_name(i) for i in range(torch.xpu.device_count())
+        ]
+
+    accelerate_config = accelerate_config_str = "not found"
+
+    # Get the default from the config file.
+    if os.path.isfile(default_config_file):
+        accelerate_config = load_config_from_file(default_config_file).to_dict()
+
+    accelerate_config_str = (
+        "\n"
+        + "\n".join([f"  - {prop}: {val}" for prop, val in accelerate_config.items()])
+        if isinstance(accelerate_config, dict)
+        else accelerate_config
+    )
+
+    commit_hash = get_git_commit_hash("trl")
+
+    info = {
+        "Platform": platform.platform(),
+        "Python version": platform.python_version(),
+        "TRL version": f"{__version__}+{commit_hash[:7]}"
+        if commit_hash
+        else __version__,
+        "PyTorch version": version("torch"),
+        "accelerator(s)": ", ".join(devices) if devices is not None else "cpu",
+        "Transformers version": version("transformers"),
+        "Accelerate version": version("accelerate"),
+        "Accelerate config": accelerate_config_str,
+        "Datasets version": version("datasets"),
+        "HF Hub version": version("huggingface_hub"),
+        "bitsandbytes version": version("bitsandbytes")
+        if is_bitsandbytes_available()
+        else "not installed",
+        "DeepSpeed version": version("deepspeed")
+        if is_deepspeed_available()
+        else "not installed",
+        "Liger-Kernel version": version("liger_kernel")
+        if is_liger_kernel_available()
+        else "not installed",
+        "LLM-Blender version": version("llm_blender")
+        if is_llm_blender_available()
+        else "not installed",
+        "OpenAI version": version("openai")
+        if is_openai_available()
+        else "not installed",
+        "PEFT version": version("peft") if is_peft_available() else "not installed",
+        "vLLM version": version("vllm") if is_vllm_available() else "not installed",
+    }
+
+    info_str = "\n".join([f"- {prop}: {val}" for prop, val in info.items()])
+    print(
+        f"\nCopy-paste the following information when reporting an issue:\n\n{info_str}\n"
+    )  # noqa
+
+
+if __name__ == "__main__":
+    print_env()
diff --git a/src/aixpert/training/training/trl/scripts/grpo.py b/src/aixpert/training/training/trl/scripts/grpo.py
new file mode 100644
index 0000000..f2644dc
--- /dev/null
+++ b/src/aixpert/training/training/trl/scripts/grpo.py
@@ -0,0 +1,193 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# /// script
+# dependencies = [
+#     "trl",
+#     "peft",
+#     "trackio",
+#     "kernels",
+# ]
+# ///
+
+import argparse
+import importlib
+import os
+import sys
+from dataclasses import dataclass, field
+
+from accelerate import logging
+from datasets import load_dataset
+
+from trl import (
+    DatasetMixtureConfig,
+    GRPOConfig,
+    GRPOTrainer,
+    ModelConfig,
+    ScriptArguments,
+    TrlParser,
+    get_dataset,
+    get_peft_config,
+)
+from trl.rewards import (
+    accuracy_reward,
+    get_soft_overlong_punishment,
+    think_format_reward,
+)
+
+
+logger = logging.get_logger(__name__)
+
+# Enable logging in a Hugging Face Space
+os.environ.setdefault("TRACKIO_SPACE_ID", "trl-trackio")
+
+
+reward_funcs_registry = {
+    "accuracy_reward": accuracy_reward,
+    "think_format_reward": think_format_reward,
+    "get_soft_overlong_punishment": get_soft_overlong_punishment(
+        max_completion_len=1280, soft_punish_cache=256
+    ),
+}
+
+
+@dataclass
+class GRPOScriptArguments(ScriptArguments):
+    """
+    Script arguments for the GRPO training script.
+
+    Args:
+        reward_model_name_or_path (`str`, *optional*):
+            Reward model id of a pretrained model hosted inside a model repo on huggingface.co or local path to a
+            directory containing model weights saved using [`~transformers.PreTrainedModel.save_pretrained`].
+        reward_funcs (`list[str]`, *optional*):
+            Reward functions to use. Supported values are:
+
+                - `"accuracy_reward"`
+                - `"think_format_reward"`
+                - `"get_soft_overlong_punishment"` (used value are `max_completion_len=1280`, `soft_punish_cache=256`)
+                - any dotted import path " (e.g., `'my_lib.rewards.custom_reward'`).
+    """
+
+    reward_model_name_or_path: str | None = field(
+        default=None,
+        metadata={
+            "help": "Reward model id of a pretrained model hosted inside a model repo on huggingface.co or "
+            "local path to a directory containing model weights saved using `PreTrainedModel.save_pretrained`."
+        },
+    )
+    reward_funcs: list[str] | None = field(
+        default=None,
+        metadata={
+            "help": "Reward functions to use. Supported values are: `accuracy_reward`, `think_format_reward`, "
+            "`get_soft_overlong_punishment` (used value are `max_completion_len=1280`, `soft_punish_cache=256`), or "
+            "any dotted import path (e.g., `'my_lib.rewards.custom_reward'`)."
+        },
+    )
+
+
+def main(script_args, training_args, model_args, dataset_args):
+    # Get the reward models and functions
+    reward_funcs = []
+    if script_args.reward_model_name_or_path:
+        reward_funcs.append(script_args.reward_model_name_or_path)
+
+    if script_args.reward_funcs:
+        for func_name in script_args.reward_funcs:
+            if func_name in reward_funcs_registry:
+                reward_funcs.append(reward_funcs_registry[func_name])
+            elif "." in func_name:
+                module_path, func_name = func_name.rsplit(".", 1)
+                sys.path.insert(0, os.getcwd())
+                module = importlib.import_module(module_path)
+                reward_func = getattr(module, func_name)
+                reward_funcs.append(reward_func)
+            else:
+                raise ValueError(
+                    f"Could not load reward function '{func_name}'. Expected one of "
+                    f"{list(reward_funcs_registry.keys())} or a valid import path."
+                )
+
+    # Load the dataset
+    if dataset_args.datasets and script_args.dataset_name:
+        logger.warning(
+            "Both `datasets` and `dataset_name` are provided. The `datasets` argument will be used to load the "
+            "dataset and `dataset_name` will be ignored."
+        )
+        dataset = get_dataset(dataset_args)
+    elif dataset_args.datasets and not script_args.dataset_name:
+        dataset = get_dataset(dataset_args)
+    elif not dataset_args.datasets and script_args.dataset_name:
+        dataset = load_dataset(
+            script_args.dataset_name,
+            name=script_args.dataset_config,
+            streaming=script_args.dataset_streaming,
+        )
+    else:
+        raise ValueError("Either `datasets` or `dataset_name` must be provided.")
+
+    # Initialize the GRPO trainer
+    trainer = GRPOTrainer(
+        model=model_args.model_name_or_path,
+        reward_funcs=reward_funcs,
+        args=training_args,
+        train_dataset=dataset[script_args.dataset_train_split],
+        eval_dataset=dataset[script_args.dataset_test_split]
+        if training_args.eval_strategy != "no"
+        else None,
+        peft_config=get_peft_config(model_args),
+    )
+
+    # Train the model
+    trainer.train()
+
+    # Log training complete
+    trainer.accelerator.print("✅ Training completed.")
+
+    # Save and push to Hub
+    trainer.save_model(training_args.output_dir)
+    trainer.accelerator.print(f"💾 Model saved to {training_args.output_dir}.")
+
+    if training_args.push_to_hub:
+        trainer.push_to_hub(dataset_name=script_args.dataset_name)
+        trainer.accelerator.print(
+            f"🤗 Model pushed to the Hub in https://huggingface.co/{trainer.hub_model_id}."
+        )
+
+
+def make_parser(subparsers: argparse._SubParsersAction | None = None):
+    dataclass_types = (
+        GRPOScriptArguments,
+        GRPOConfig,
+        ModelConfig,
+        DatasetMixtureConfig,
+    )
+    if subparsers is not None:
+        parser = subparsers.add_parser(
+            "grpo", help="Run the GRPO training script", dataclass_types=dataclass_types
+        )
+    else:
+        parser = TrlParser(dataclass_types)
+    return parser
+
+
+if __name__ == "__main__":
+    parser = make_parser()
+    # When using the trl cli, this script may be run with additional arguments, corresponding accelerate arguments.
+    # To ensure that their parsing does not interfere with the script arguments, parse the arguments with
+    # `return_remaining_strings=True`, then ignore the remaining strings.
+    script_args, training_args, model_args, dataset_args, _ = (
+        parser.parse_args_and_config(return_remaining_strings=True)
+    )
+    main(script_args, training_args, model_args, dataset_args)
diff --git a/src/aixpert/training/training/trl/scripts/kto.py b/src/aixpert/training/training/trl/scripts/kto.py
new file mode 100644
index 0000000..c15efb5
--- /dev/null
+++ b/src/aixpert/training/training/trl/scripts/kto.py
@@ -0,0 +1,174 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# /// script
+# dependencies = [
+#     "trl",
+#     "peft",
+#     "trackio",
+#     "kernels",
+# ]
+# ///
+
+"""
+Run the KTO training script with the commands below. In general, the optimal configuration for KTO will be similar to
+that of DPO.
+
+# Full training:
+```bash
+python trl/scripts/kto.py \
+    --dataset_name trl-lib/kto-mix-14k \
+    --model_name_or_path=trl-lib/qwen1.5-1.8b-sft \
+    --per_device_train_batch_size 16 \
+    --num_train_epochs 1 \
+    --learning_rate 5e-7 \
+    --lr_scheduler_type=cosine \
+    --gradient_accumulation_steps 1 \
+    --eval_steps 500 \
+    --output_dir=kto-aligned-model \
+    --warmup_ratio 0.1 \
+    --logging_first_step
+```
+
+# QLoRA:
+```bash
+# QLoRA:
+python trl/scripts/kto.py \
+    --dataset_name trl-lib/kto-mix-14k \
+    --model_name_or_path=trl-lib/qwen1.5-1.8b-sft \
+    --per_device_train_batch_size 8 \
+    --num_train_epochs 1 \
+    --learning_rate 5e-7 \
+    --lr_scheduler_type=cosine \
+    --gradient_accumulation_steps 1 \
+    --eval_steps 500 \
+    --output_dir=kto-aligned-model-lora \
+    --warmup_ratio 0.1 \
+    --logging_first_step \
+    --use_peft \
+    --load_in_4bit \
+    --lora_target_modules=all-linear \
+    --lora_r=16 \
+    --lora_alpha=16
+```
+"""
+
+import argparse
+import os
+
+from accelerate import logging
+from datasets import load_dataset
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from trl import (
+    DatasetMixtureConfig,
+    KTOConfig,
+    KTOTrainer,
+    ModelConfig,
+    ScriptArguments,
+    TrlParser,
+    get_dataset,
+    get_peft_config,
+)
+
+
+logger = logging.get_logger(__name__)
+
+# Enable logging in a Hugging Face Space
+os.environ.setdefault("TRACKIO_SPACE_ID", "trl-trackio")
+
+
+def main(script_args, training_args, model_args, dataset_args):
+    # Load a pretrained model
+    model = AutoModelForCausalLM.from_pretrained(
+        model_args.model_name_or_path, trust_remote_code=model_args.trust_remote_code
+    )
+    ref_model = AutoModelForCausalLM.from_pretrained(
+        model_args.model_name_or_path, trust_remote_code=model_args.trust_remote_code
+    )
+
+    tokenizer = AutoTokenizer.from_pretrained(
+        model_args.model_name_or_path, trust_remote_code=model_args.trust_remote_code
+    )
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+
+    # Load the dataset
+    if dataset_args.datasets and script_args.dataset_name:
+        logger.warning(
+            "Both `datasets` and `dataset_name` are provided. The `datasets` argument will be used to load the "
+            "dataset and `dataset_name` will be ignored."
+        )
+        dataset = get_dataset(dataset_args)
+    elif dataset_args.datasets and not script_args.dataset_name:
+        dataset = get_dataset(dataset_args)
+    elif not dataset_args.datasets and script_args.dataset_name:
+        dataset = load_dataset(
+            script_args.dataset_name,
+            name=script_args.dataset_config,
+            streaming=script_args.dataset_streaming,
+        )
+    else:
+        raise ValueError("Either `datasets` or `dataset_name` must be provided.")
+
+    # Initialize the KTO trainer
+    trainer = KTOTrainer(
+        model,
+        ref_model,
+        args=training_args,
+        train_dataset=dataset[script_args.dataset_train_split],
+        eval_dataset=dataset[script_args.dataset_test_split]
+        if training_args.eval_strategy != "no"
+        else None,
+        processing_class=tokenizer,
+        peft_config=get_peft_config(model_args),
+    )
+
+    # Train the model
+    trainer.train()
+
+    # Log training complete
+    trainer.accelerator.print("✅ Training completed.")
+
+    # Save and push to Hub
+    trainer.save_model(training_args.output_dir)
+    trainer.accelerator.print(f"💾 Model saved to {training_args.output_dir}.")
+
+    if training_args.push_to_hub:
+        trainer.push_to_hub(dataset_name=script_args.dataset_name)
+        trainer.accelerator.print(
+            f"🤗 Model pushed to the Hub in https://huggingface.co/{trainer.hub_model_id}."
+        )
+
+
+def make_parser(subparsers: argparse._SubParsersAction | None = None):
+    dataclass_types = (ScriptArguments, KTOConfig, ModelConfig, DatasetMixtureConfig)
+    if subparsers is not None:
+        parser = subparsers.add_parser(
+            "kto", help="Run the KTO training script", dataclass_types=dataclass_types
+        )
+    else:
+        parser = TrlParser(dataclass_types)
+    return parser
+
+
+if __name__ == "__main__":
+    parser = make_parser()
+    # When using the trl cli, this script may be run with additional arguments, corresponding accelerate arguments.
+    # To ensure that their parsing does not interfere with the script arguments, parse the arguments with
+    # `return_remaining_strings=True`, then ignore the remaining strings.
+    script_args, training_args, model_args, dataset_args, _ = (
+        parser.parse_args_and_config(return_remaining_strings=True)
+    )
+    main(script_args, training_args, model_args, dataset_args)
diff --git a/src/aixpert/training/training/trl/scripts/reward.py b/src/aixpert/training/training/trl/scripts/reward.py
new file mode 100644
index 0000000..bbe7fe5
--- /dev/null
+++ b/src/aixpert/training/training/trl/scripts/reward.py
@@ -0,0 +1,116 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# /// script
+# dependencies = [
+#     "trl",
+#     "peft",
+#     "trackio",
+#     "kernels",
+# ]
+# ///
+
+import argparse
+import os
+
+from accelerate import logging
+from datasets import load_dataset
+
+from trl import (
+    DatasetMixtureConfig,
+    ModelConfig,
+    RewardConfig,
+    RewardTrainer,
+    ScriptArguments,
+    TrlParser,
+    get_dataset,
+    get_peft_config,
+)
+
+
+logger = logging.get_logger(__name__)
+
+# Enable logging in a Hugging Face Space
+os.environ.setdefault("TRACKIO_SPACE_ID", "trl-trackio")
+
+
+def main(script_args, training_args, model_args, dataset_args):
+    # Load the dataset
+    if dataset_args.datasets and script_args.dataset_name:
+        logger.warning(
+            "Both `datasets` and `dataset_name` are provided. The `datasets` argument will be used to load the "
+            "dataset and `dataset_name` will be ignored."
+        )
+        dataset = get_dataset(dataset_args)
+    elif dataset_args.datasets and not script_args.dataset_name:
+        dataset = get_dataset(dataset_args)
+    elif not dataset_args.datasets and script_args.dataset_name:
+        dataset = load_dataset(
+            script_args.dataset_name,
+            name=script_args.dataset_config,
+            streaming=script_args.dataset_streaming,
+        )
+    else:
+        raise ValueError("Either `datasets` or `dataset_name` must be provided.")
+
+    # Initialize the RewardTrainer
+    trainer = RewardTrainer(
+        model=model_args.model_name_or_path,
+        args=training_args,
+        train_dataset=dataset[script_args.dataset_train_split],
+        eval_dataset=dataset[script_args.dataset_test_split]
+        if training_args.eval_strategy != "no"
+        else None,
+        peft_config=get_peft_config(model_args),
+    )
+
+    # Train the model
+    trainer.train()
+
+    # Log training complete
+    trainer.accelerator.print("✅ Training completed.")
+
+    # Save and push to Hub
+    trainer.save_model(training_args.output_dir)
+    trainer.accelerator.print(f"💾 Model saved to {training_args.output_dir}.")
+
+    if training_args.push_to_hub:
+        trainer.push_to_hub(dataset_name=script_args.dataset_name)
+        trainer.accelerator.print(
+            f"🤗 Model pushed to the Hub in https://huggingface.co/{trainer.hub_model_id}."
+        )
+
+
+def make_parser(subparsers: argparse._SubParsersAction | None = None):
+    dataclass_types = (ScriptArguments, RewardConfig, ModelConfig, DatasetMixtureConfig)
+    if subparsers is not None:
+        parser = subparsers.add_parser(
+            "reward",
+            help="Run the reward training script",
+            dataclass_types=dataclass_types,
+        )
+    else:
+        parser = TrlParser(dataclass_types)
+    return parser
+
+
+if __name__ == "__main__":
+    parser = make_parser()
+    # When using the trl cli, this script may be run with additional arguments, corresponding accelerate arguments.
+    # To ensure that their parsing does not interfere with the script arguments, parse the arguments with
+    # `return_remaining_strings=True`, then ignore the remaining strings.
+    script_args, training_args, model_args, dataset_args, _ = (
+        parser.parse_args_and_config(return_remaining_strings=True)
+    )
+    main(script_args, training_args, model_args, dataset_args)
diff --git a/src/aixpert/training/training/trl/scripts/rloo.py b/src/aixpert/training/training/trl/scripts/rloo.py
new file mode 100644
index 0000000..4fdc574
--- /dev/null
+++ b/src/aixpert/training/training/trl/scripts/rloo.py
@@ -0,0 +1,193 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# /// script
+# dependencies = [
+#     "trl",
+#     "peft",
+#     "trackio",
+#     "kernels",
+# ]
+# ///
+
+import argparse
+import importlib
+import os
+import sys
+from dataclasses import dataclass, field
+
+from accelerate import logging
+from datasets import load_dataset
+
+from trl import (
+    DatasetMixtureConfig,
+    ModelConfig,
+    RLOOConfig,
+    RLOOTrainer,
+    ScriptArguments,
+    TrlParser,
+    get_dataset,
+    get_peft_config,
+)
+from trl.rewards import (
+    accuracy_reward,
+    get_soft_overlong_punishment,
+    think_format_reward,
+)
+
+
+logger = logging.get_logger(__name__)
+
+# Enable logging in a Hugging Face Space
+os.environ.setdefault("TRACKIO_SPACE_ID", "trl-trackio")
+
+
+reward_funcs_registry = {
+    "accuracy_reward": accuracy_reward,
+    "think_format_reward": think_format_reward,
+    "get_soft_overlong_punishment": get_soft_overlong_punishment(
+        max_completion_len=1280, soft_punish_cache=256
+    ),
+}
+
+
+@dataclass
+class RLOOScriptArguments(ScriptArguments):
+    """
+    Script arguments for the RLOO training script.
+
+    Args:
+        reward_model_name_or_path (`str`, *optional*):
+            Reward model id of a pretrained model hosted inside a model repo on huggingface.co or local path to a
+            directory containing model weights saved using [`~transformers.PreTrainedModel.save_pretrained`].
+        reward_funcs (`list[str]`, *optional*):
+            Reward functions to use. Supported values are:
+
+                - `"accuracy_reward"`
+                - `"think_format_reward"`
+                - `"get_soft_overlong_punishment"` (used value are `max_completion_len=1280`, `soft_punish_cache=256`)
+                - any dotted import path " (e.g., `'my_lib.rewards.custom_reward'`).
+    """
+
+    reward_model_name_or_path: str | None = field(
+        default=None,
+        metadata={
+            "help": "Reward model id of a pretrained model hosted inside a model repo on huggingface.co or "
+            "local path to a directory containing model weights saved using `PreTrainedModel.save_pretrained`."
+        },
+    )
+    reward_funcs: list[str] | None = field(
+        default=None,
+        metadata={
+            "help": "Reward functions to use. Supported values are: `accuracy_reward`, `think_format_reward`, "
+            "`get_soft_overlong_punishment` (used value are `max_completion_len=1280`, `soft_punish_cache=256`), or "
+            "any dotted import path (e.g., `'my_lib.rewards.custom_reward'`)."
+        },
+    )
+
+
+def main(script_args, training_args, model_args, dataset_args):
+    # Get the reward models and functions
+    reward_funcs = []
+    if script_args.reward_model_name_or_path:
+        reward_funcs.append(script_args.reward_model_name_or_path)
+
+    if script_args.reward_funcs:
+        for func_name in script_args.reward_funcs:
+            if func_name in reward_funcs_registry:
+                reward_funcs.append(reward_funcs_registry[func_name])
+            elif "." in func_name:
+                module_path, func_name = func_name.rsplit(".", 1)
+                sys.path.insert(0, os.getcwd())
+                module = importlib.import_module(module_path)
+                reward_func = getattr(module, func_name)
+                reward_funcs.append(reward_func)
+            else:
+                raise ValueError(
+                    f"Could not load reward function '{func_name}'. Expected one of "
+                    f"{list(reward_funcs_registry.keys())} or a valid import path."
+                )
+
+    # Load the dataset
+    if dataset_args.datasets and script_args.dataset_name:
+        logger.warning(
+            "Both `datasets` and `dataset_name` are provided. The `datasets` argument will be used to load the "
+            "dataset and `dataset_name` will be ignored."
+        )
+        dataset = get_dataset(dataset_args)
+    elif dataset_args.datasets and not script_args.dataset_name:
+        dataset = get_dataset(dataset_args)
+    elif not dataset_args.datasets and script_args.dataset_name:
+        dataset = load_dataset(
+            script_args.dataset_name,
+            name=script_args.dataset_config,
+            streaming=script_args.dataset_streaming,
+        )
+    else:
+        raise ValueError("Either `datasets` or `dataset_name` must be provided.")
+
+    # Initialize the RLOO trainer
+    trainer = RLOOTrainer(
+        model=model_args.model_name_or_path,
+        reward_funcs=reward_funcs,
+        args=training_args,
+        train_dataset=dataset[script_args.dataset_train_split],
+        eval_dataset=dataset[script_args.dataset_test_split]
+        if training_args.eval_strategy != "no"
+        else None,
+        peft_config=get_peft_config(model_args),
+    )
+
+    # Train the model
+    trainer.train()
+
+    # Log training complete
+    trainer.accelerator.print("✅ Training completed.")
+
+    # Save and push to Hub
+    trainer.save_model(training_args.output_dir)
+    trainer.accelerator.print(f"💾 Model saved to {training_args.output_dir}.")
+
+    if training_args.push_to_hub:
+        trainer.push_to_hub(dataset_name=script_args.dataset_name)
+        trainer.accelerator.print(
+            f"🤗 Model pushed to the Hub in https://huggingface.co/{trainer.hub_model_id}."
+        )
+
+
+def make_parser(subparsers: argparse._SubParsersAction | None = None):
+    dataclass_types = (
+        RLOOScriptArguments,
+        RLOOConfig,
+        ModelConfig,
+        DatasetMixtureConfig,
+    )
+    if subparsers is not None:
+        parser = subparsers.add_parser(
+            "rloo", help="Run the RLOO training script", dataclass_types=dataclass_types
+        )
+    else:
+        parser = TrlParser(dataclass_types)
+    return parser
+
+
+if __name__ == "__main__":
+    parser = make_parser()
+    # When using the trl cli, this script may be run with additional arguments, corresponding accelerate arguments.
+    # To ensure that their parsing does not interfere with the script arguments, parse the arguments with
+    # `return_remaining_strings=True`, then ignore the remaining strings.
+    script_args, training_args, model_args, dataset_args, _ = (
+        parser.parse_args_and_config(return_remaining_strings=True)
+    )
+    main(script_args, training_args, model_args, dataset_args)
diff --git a/src/aixpert/training/training/trl/scripts/sft.py b/src/aixpert/training/training/trl/scripts/sft.py
new file mode 100644
index 0000000..b0a7a0f
--- /dev/null
+++ b/src/aixpert/training/training/trl/scripts/sft.py
@@ -0,0 +1,193 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# /// script
+# dependencies = [
+#     "trl",
+#     "peft",
+#     "trackio",
+#     "kernels",
+# ]
+# ///
+
+"""
+# Full training
+```
+python trl/scripts/sft.py \
+    --model_name_or_path Qwen/Qwen2-0.5B \
+    --dataset_name trl-lib/Capybara \
+    --learning_rate 2.0e-5 \
+    --num_train_epochs 1 \
+    --packing \
+    --per_device_train_batch_size 2 \
+    --gradient_accumulation_steps 8 \
+    --gradient_checkpointing \
+    --eos_token '<|im_end|>' \
+    --eval_strategy steps \
+    --eval_steps 100 \
+    --output_dir Qwen2-0.5B-SFT \
+    --push_to_hub
+```
+
+# LoRA
+```
+python trl/scripts/sft.py \
+    --model_name_or_path Qwen/Qwen2-0.5B \
+    --dataset_name trl-lib/Capybara \
+    --learning_rate 2.0e-4 \
+    --num_train_epochs 1 \
+    --packing \
+    --per_device_train_batch_size 2 \
+    --gradient_accumulation_steps 8 \
+    --gradient_checkpointing \
+    --eos_token '<|im_end|>' \
+    --eval_strategy steps \
+    --eval_steps 100 \
+    --use_peft \
+    --lora_r 32 \
+    --lora_alpha 16 \
+    --output_dir Qwen2-0.5B-SFT \
+    --push_to_hub
+```
+"""
+
+import argparse
+import os
+
+from accelerate import logging
+from datasets import load_dataset
+from transformers import AutoConfig, AutoModelForCausalLM
+from transformers.models.auto.modeling_auto import (
+    MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES,
+)
+
+from trl import (
+    DatasetMixtureConfig,
+    ModelConfig,
+    ScriptArguments,
+    SFTConfig,
+    SFTTrainer,
+    TrlParser,
+    get_dataset,
+    get_kbit_device_map,
+    get_peft_config,
+    get_quantization_config,
+)
+
+
+logger = logging.get_logger(__name__)
+
+# Enable logging in a Hugging Face Space
+os.environ.setdefault("TRACKIO_SPACE_ID", "trl-trackio")
+
+
+def main(script_args, training_args, model_args, dataset_args):
+    ################
+    # Model init kwargs
+    ################
+    model_kwargs = dict(
+        revision=model_args.model_revision,
+        trust_remote_code=model_args.trust_remote_code,
+        attn_implementation=model_args.attn_implementation,
+        dtype=model_args.dtype,
+    )
+    quantization_config = get_quantization_config(model_args)
+    if quantization_config is not None:
+        # Passing None would not be treated the same as omitting the argument, so we include it only when valid.
+        model_kwargs["device_map"] = get_kbit_device_map()
+        model_kwargs["quantization_config"] = quantization_config
+
+    # Create model
+    config = AutoConfig.from_pretrained(model_args.model_name_or_path)
+    valid_image_text_architectures = MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values()
+
+    if config.architectures and any(
+        arch in valid_image_text_architectures for arch in config.architectures
+    ):
+        from transformers import AutoModelForImageTextToText
+
+        model = AutoModelForImageTextToText.from_pretrained(
+            model_args.model_name_or_path, **model_kwargs
+        )
+    else:
+        model = AutoModelForCausalLM.from_pretrained(
+            model_args.model_name_or_path, **model_kwargs
+        )
+
+    # Load the dataset
+    if dataset_args.datasets and script_args.dataset_name:
+        logger.warning(
+            "Both `datasets` and `dataset_name` are provided. The `datasets` argument will be used to load the "
+            "dataset and `dataset_name` will be ignored."
+        )
+        dataset = get_dataset(dataset_args)
+    elif dataset_args.datasets and not script_args.dataset_name:
+        dataset = get_dataset(dataset_args)
+    elif not dataset_args.datasets and script_args.dataset_name:
+        dataset = load_dataset(
+            script_args.dataset_name,
+            name=script_args.dataset_config,
+            streaming=script_args.dataset_streaming,
+        )
+    else:
+        raise ValueError("Either `datasets` or `dataset_name` must be provided.")
+
+    # Initialize the SFT trainer
+    trainer = SFTTrainer(
+        model=model,
+        args=training_args,
+        train_dataset=dataset[script_args.dataset_train_split],
+        eval_dataset=dataset[script_args.dataset_test_split]
+        if training_args.eval_strategy != "no"
+        else None,
+        peft_config=get_peft_config(model_args),
+    )
+
+    # Train the model
+    trainer.train()
+
+    # Log training complete
+    trainer.accelerator.print("✅ Training completed.")
+
+    # Save and push to Hub
+    trainer.save_model(training_args.output_dir)
+    trainer.accelerator.print(f"💾 Model saved to {training_args.output_dir}.")
+
+    if training_args.push_to_hub:
+        trainer.push_to_hub(dataset_name=script_args.dataset_name)
+        trainer.accelerator.print(
+            f"🤗 Model pushed to the Hub in https://huggingface.co/{trainer.hub_model_id}."
+        )
+
+
+def make_parser(subparsers: argparse._SubParsersAction | None = None):
+    dataclass_types = (ScriptArguments, SFTConfig, ModelConfig, DatasetMixtureConfig)
+    if subparsers is not None:
+        parser = subparsers.add_parser(
+            "sft", help="Run the SFT training script", dataclass_types=dataclass_types
+        )
+    else:
+        parser = TrlParser(dataclass_types)
+    return parser
+
+
+if __name__ == "__main__":
+    parser = make_parser()
+    # When using the trl cli, this script may be run with additional arguments, corresponding accelerate arguments.
+    # To ensure that their parsing does not interfere with the script arguments, parse the arguments with
+    # `return_remaining_strings=True`, then ignore the remaining strings.
+    script_args, training_args, model_args, dataset_args, _ = (
+        parser.parse_args_and_config(return_remaining_strings=True)
+    )
+    main(script_args, training_args, model_args, dataset_args)
diff --git a/src/aixpert/training/training/trl/scripts/utils.py b/src/aixpert/training/training/trl/scripts/utils.py
new file mode 100644
index 0000000..089ba49
--- /dev/null
+++ b/src/aixpert/training/training/trl/scripts/utils.py
@@ -0,0 +1,507 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import importlib
+import inspect
+import logging
+import os
+import subprocess
+import sys
+from collections.abc import Iterable
+from dataclasses import dataclass, field
+
+import datasets
+import yaml
+from datasets import DatasetDict, concatenate_datasets
+from transformers import HfArgumentParser
+from transformers.hf_argparser import DataClass, DataClassType
+from transformers.utils import is_rich_available
+
+
+def _ensure_transformers_parallelism_config() -> None:
+    """
+    Ensure that ``transformers.training_args`` always defines the symbol `ParallelismConfig` so that Python's
+    `typing.get_type_hints` can resolve annotations on `transformers.TrainingArguments` without raising a `NameError`.
+
+    This is needed when running with ``accelerate<1.10.1``, where the module ``accelerate.parallelism_config`` did not
+    exist and therefore the type alias is not imported by Transformers.
+
+    See upstream fix PR in transformers#40818.
+    """
+    from typing import Any
+
+    import transformers.training_args
+
+    if not hasattr(transformers.training_args, "ParallelismConfig"):
+        transformers.training_args.ParallelismConfig = Any
+
+
+_ensure_transformers_parallelism_config()  # before creating HfArgumentParser
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class DatasetConfig:
+    """
+    Configuration for a dataset.
+
+    This class matches the signature of [`~datasets.load_dataset`] and the arguments are used directly in the
+    [`~datasets.load_dataset`] function. You can refer to the [`~datasets.load_dataset`] documentation for more
+    details.
+
+    Parameters
+    ----------
+        path (`str`):
+            Path or name of the dataset.
+        name (`str`, *optional*):
+            Defining the name of the dataset configuration.
+        data_dir (`str`, *optional*):
+            Defining the `data_dir` of the dataset configuration. If specified for the generic builders(csv, text etc.)
+            or the Hub datasets and `data_files` is `None`, the behavior is equal to passing `os.path.join(data_dir,
+            **)` as `data_files` to reference all the files in a directory.
+        data_files (`str` or `Sequence` or `Mapping`, *optional*):
+            Path(s) to source data file(s).
+        split (`str`, *optional*, defaults to `"train"`):
+            Which split of the data to load.
+        columns (`list[str]`, *optional*):
+            List of column names to select from the dataset. If `None`, all columns are selected.
+    """
+
+    path: str
+    name: str | None = None
+    data_dir: str | None = None
+    data_files: str | list[str] | dict[str, str] | None = None
+    split: str = "train"
+    columns: list[str] | None = None
+
+
+@dataclass
+class DatasetMixtureConfig:
+    """
+    Configuration class for a mixture of datasets.
+
+    Using [`~transformers.HfArgumentParser`] we can turn this class into
+    [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
+    command line.
+
+    Parameters
+    ----------
+        datasets (`list[DatasetConfig]`):
+            List of dataset configurations to include in the mixture.
+        streaming (`bool`, *optional*, defaults to `False`):
+            Whether to stream the datasets. If `True`, the datasets will be loaded in streaming mode.
+        test_split_size (`float`, *optional*):
+            Size of the test split. Refer to the `test_size` parameter in the [`~datasets.train_test_split`] function
+            for more details. If `None`, the dataset will not be split into train and test sets.
+
+    Usage:
+        When using the CLI, you can add the following section to your YAML config file:
+
+        ```yaml
+        datasets:
+          - path: ...
+            name: ...
+            data_dir: ...
+            data_files: ...
+            split: ...
+            columns: ...
+          - path: ...
+            name: ...
+            data_dir: ...
+            data_files: ...
+            split: ...
+            columns: ...
+        streaming: ...
+        test_split_size: ...
+        ```
+    """
+
+    datasets: list[DatasetConfig] = field(
+        default_factory=list,
+        metadata={"help": "List of dataset configurations to include in the mixture."},
+    )
+    streaming: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to stream the datasets. If True, the datasets will be loaded in streaming mode."
+        },
+    )
+    test_split_size: float | None = field(
+        default=None,
+        metadata={
+            "help": "Size of the test split. Refer to the `test_size` parameter in the `datasets.train_test_split` "
+            "function for more details. If None, the dataset will not be split into train and test sets."
+        },
+    )
+
+    def __post_init__(self):
+        # Convert any dataset dicts (from CLI/config parsing) into DatasetConfig objects
+        for idx, dataset in enumerate(self.datasets):
+            if isinstance(dataset, dict):
+                # If it's a dict, convert it to DatasetConfig
+                self.datasets[idx] = DatasetConfig(**dataset)
+
+
+@dataclass
+class ScriptArguments:
+    """
+    Arguments common to all scripts.
+
+    Args:
+        dataset_name (`str`,, *optional*):
+            Path or name of the dataset to load. If `datasets` is provided, this will be ignored.
+        dataset_config (`str`, *optional*):
+            Dataset configuration name. Corresponds to the `name` argument of the [`~datasets.load_dataset`] function.
+            If `datasets` is provided, this will be ignored.
+        dataset_train_split (`str`, *optional*, defaults to `"train"`):
+            Dataset split to use for training. If `datasets` is provided, this will be ignored.
+        dataset_test_split (`str`, *optional*, defaults to `"test"`):
+            Dataset split to use for evaluation. If `datasets` is provided, this will be ignored.
+        dataset_streaming (`bool`, *optional*, defaults to `False`):
+            Whether to stream the dataset. If True, the dataset will be loaded in streaming mode. If `datasets` is
+            provided, this will be ignored.
+        gradient_checkpointing_use_reentrant (`bool`, *optional*, defaults to `False`):
+            Whether to apply `use_reentrant` for gradient checkpointing.
+        ignore_bias_buffers (`bool`, *optional*, defaults to `False`):
+            Debug argument for distributed training. Fix for DDP issues with LM bias/mask buffers - invalid scalar
+            type, inplace operation. See
+            https://github.com/huggingface/transformers/issues/22482#issuecomment-1595790992.
+    """
+
+    dataset_name: str | None = field(
+        default=None,
+        metadata={
+            "help": "Path or name of the dataset to load. If `datasets` is provided, this will be ignored."
+        },
+    )
+    dataset_config: str | None = field(
+        default=None,
+        metadata={
+            "help": "Dataset configuration name. Corresponds to the `name` argument of the `datasets.load_dataset` "
+            "function. If `datasets` is provided, this will be ignored."
+        },
+    )
+    dataset_train_split: str = field(
+        default="train",
+        metadata={
+            "help": "Dataset split to use for training. If `datasets` is provided, this will be ignored."
+        },
+    )
+    dataset_test_split: str = field(
+        default="test",
+        metadata={
+            "help": "Dataset split to use for evaluation. If `datasets` is provided, this will be ignored."
+        },
+    )
+    dataset_streaming: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to stream the dataset. If True, the dataset will be loaded in streaming mode. If "
+            "`datasets` is provided, this will be ignored."
+        },
+    )
+    gradient_checkpointing_use_reentrant: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to apply `use_reentrant` for gradient checkpointing."
+        },
+    )
+    ignore_bias_buffers: bool = field(
+        default=False,
+        metadata={
+            "help": "Debug argument for distributed training. Fix for DDP issues with LM bias/mask buffers - invalid "
+            "scalar type, inplace operation. See "
+            "https://github.com/huggingface/transformers/issues/22482#issuecomment-1595790992."
+        },
+    )
+
+
+def init_zero_verbose():
+    """
+    Perform zero verbose init - use this method on top of the CLI modules to make logging and warning output cleaner.
+    Uses Rich if available, falls back otherwise.
+    """
+    import logging
+    import warnings
+
+    FORMAT = "%(message)s"
+
+    if is_rich_available():
+        from rich.logging import RichHandler
+
+        handler = RichHandler()
+    else:
+        handler = logging.StreamHandler()
+
+    logging.basicConfig(
+        format=FORMAT, datefmt="[%X]", handlers=[handler], level=logging.ERROR
+    )
+
+    # Custom warning handler to redirect warnings to the logging system
+    def warning_handler(message, category, filename, lineno, file=None, line=None):
+        logging.warning(f"{filename}:{lineno}: {category.__name__}: {message}")
+
+    # Add the custom warning handler - we need to do that before importing anything to make sure the loggers work well
+    warnings.showwarning = warning_handler
+
+
+class TrlParser(HfArgumentParser):
+    """
+    A subclass of [`transformers.HfArgumentParser`] designed for parsing command-line arguments with dataclass-backed
+    configurations, while also supporting configuration file loading and environment variable management.
+
+    Args:
+        dataclass_types (`DataClassType | Iterable[DataClassType]`, *optional*):
+            Dataclass types to use for argument parsing.
+        **kwargs:
+            Additional keyword arguments passed to the [`transformers.HfArgumentParser`] constructor.
+
+    Examples
+    --------
+    ```yaml
+    # config.yaml
+    env:
+        VAR1: value1
+    arg1: 23
+    ```
+
+    ```python
+    # main.py
+    import os
+    from dataclasses import dataclass
+    from trl import TrlParser
+
+
+    @dataclass
+    class MyArguments:
+        arg1: int
+        arg2: str = "alpha"
+
+
+    parser = TrlParser(dataclass_types=[MyArguments])
+    training_args = parser.parse_args_and_config()
+
+    print(training_args, os.environ.get("VAR1"))
+    ```
+
+    ```bash
+    $ python main.py --config config.yaml
+    (MyArguments(arg1=23, arg2='alpha'),) value1
+
+    $ python main.py --arg1 5 --arg2 beta
+    (MyArguments(arg1=5, arg2='beta'),) None
+    ```
+    """
+
+    def __init__(
+        self,
+        dataclass_types: DataClassType | Iterable[DataClassType] | None = None,
+        **kwargs,
+    ):
+        # Make sure dataclass_types is an iterable
+        if dataclass_types is None:
+            dataclass_types = []
+        elif not isinstance(dataclass_types, Iterable):
+            dataclass_types = [dataclass_types]
+
+        # Check that none of the dataclasses have the "config" field
+        for dataclass_type in dataclass_types:
+            if "config" in dataclass_type.__dataclass_fields__:
+                raise ValueError(
+                    f"Dataclass {dataclass_type.__name__} has a field named 'config'. This field is reserved for the "
+                    f"config file path and should not be used in the dataclass."
+                )
+
+        super().__init__(dataclass_types=dataclass_types, **kwargs)
+
+    def parse_args_and_config(
+        self,
+        args: Iterable[str] | None = None,
+        return_remaining_strings: bool = False,
+        fail_with_unknown_args: bool = True,
+    ) -> tuple[DataClass, ...]:
+        """
+        Parse command-line args and config file into instances of the specified dataclass types.
+
+        This method wraps [`transformers.HfArgumentParser.parse_args_into_dataclasses`] and also parses the config file
+        specified with the `--config` flag. The config file (in YAML format) provides argument values that replace the
+        default values in the dataclasses. Command line arguments can override values set by the config file. The
+        method also sets any environment variables specified in the `env` field of the config file.
+        """
+        args = list(args) if args is not None else sys.argv[1:]
+        if "--config" in args:
+            # Get the config file path from
+            config_index = args.index("--config")
+            args.pop(config_index)  # remove the --config flag
+            config_path = args.pop(config_index)  # get the path to the config file
+            with open(config_path) as yaml_file:
+                config = yaml.safe_load(yaml_file)
+
+            # Set the environment variables specified in the config file
+            if "env" in config:
+                env_vars = config.pop("env", {})
+                if not isinstance(env_vars, dict):
+                    raise ValueError("`env` field should be a dict in the YAML file.")
+                for key, value in env_vars.items():
+                    os.environ[key] = str(value)
+
+            # Set the defaults from the config values
+            config_remaining_strings = self.set_defaults_with_config(**config)
+        else:
+            config_remaining_strings = []
+
+        # Parse the arguments from the command line
+        output = self.parse_args_into_dataclasses(
+            args=args, return_remaining_strings=return_remaining_strings
+        )
+
+        # Merge remaining strings from the config file with the remaining strings from the command line
+        if return_remaining_strings:
+            args_remaining_strings = output[-1]
+            return output[:-1] + (config_remaining_strings + args_remaining_strings,)
+        if fail_with_unknown_args and config_remaining_strings:
+            raise ValueError(
+                f"Unknown arguments from config file: {config_remaining_strings}. Please remove them, add them to the "
+                "dataclass, or set `fail_with_unknown_args=False`."
+            )
+        return output
+
+    def set_defaults_with_config(self, **kwargs) -> list[str]:
+        """
+        Overrides the parser's default values with those provided via keyword arguments, including for subparsers.
+
+        Any argument with an updated default will also be marked as not required if it was previously required.
+
+        Returns a list of strings that were not consumed by the parser.
+        """
+
+        def apply_defaults(parser, kw):
+            used_keys = set()
+            for action in parser._actions:
+                # Handle subparsers recursively
+                if isinstance(action, argparse._SubParsersAction):
+                    for subparser in action.choices.values():
+                        used_keys.update(apply_defaults(subparser, kw))
+                elif action.dest in kw:
+                    action.default = kw[action.dest]
+                    action.required = False
+                    used_keys.add(action.dest)
+            return used_keys
+
+        used_keys = apply_defaults(self, kwargs)
+        # Remaining args not consumed by the parser
+        remaining = [
+            item
+            for key, value in kwargs.items()
+            if key not in used_keys
+            for item in (f"--{key}", str(value))
+        ]
+        return remaining
+
+
+def get_git_commit_hash(package_name):
+    try:
+        # Import the package to locate its path
+        package = importlib.import_module(package_name)
+        # Get the path to the package using inspect
+        package_path = os.path.dirname(inspect.getfile(package))
+
+        # Navigate up to the Git repository root if the package is inside a subdirectory
+        git_repo_path = os.path.abspath(os.path.join(package_path, ".."))
+        git_dir = os.path.join(git_repo_path, ".git")
+
+        if os.path.isdir(git_dir):
+            # Run the git command to get the current commit hash
+            commit_hash = (
+                subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=git_repo_path)
+                .strip()
+                .decode("utf-8")
+            )
+            return commit_hash
+        return None
+    except Exception as e:
+        return f"Error: {str(e)}"
+
+
+def get_dataset(mixture_config: DatasetMixtureConfig) -> DatasetDict:
+    """
+    Load a mixture of datasets based on the configuration.
+
+    Args:
+        mixture_config ([`DatasetMixtureConfig`]):
+            Script arguments containing dataset configuration.
+
+    Returns
+    -------
+        [`~datasets.DatasetDict`]:
+            Combined dataset(s) from the mixture configuration, with optional train/test split if `test_split_size` is
+            set.
+
+    Example:
+    ```python
+    from trl import DatasetMixtureConfig, get_dataset
+    from trl.scripts.utils import DatasetConfig
+
+    mixture_config = DatasetMixtureConfig(datasets=[DatasetConfig(path="trl-lib/tldr")])
+    dataset = get_dataset(mixture_config)
+    print(dataset)
+    ```
+
+    ```
+    DatasetDict(
+        {train: Dataset({features: ["prompt", "completion"], num_rows: 116722})}
+    )
+    ```
+    """
+    logger.info(
+        f"Creating dataset mixture with {len(mixture_config.datasets)} datasets"
+    )
+    datasets_list = []
+    for dataset_config in mixture_config.datasets:
+        logger.info(
+            f"Loading dataset for mixture: {dataset_config.path} (config name: {dataset_config.name})"
+        )
+        dataset = datasets.load_dataset(
+            path=dataset_config.path,
+            name=dataset_config.name,
+            data_dir=dataset_config.data_dir,
+            data_files=dataset_config.data_files,
+            split=dataset_config.split,
+            streaming=mixture_config.streaming,
+        )
+        if dataset_config.columns is not None:
+            dataset = dataset.select_columns(dataset_config.columns)
+        datasets_list.append(dataset)
+
+    if datasets_list:
+        combined_dataset = concatenate_datasets(datasets_list)
+        if isinstance(
+            combined_dataset, datasets.Dataset
+        ):  # IterableDataset does not have a length
+            logger.info(
+                f"Created dataset mixture with {len(combined_dataset)} examples"
+            )
+
+        if mixture_config.test_split_size is not None:
+            logger.info(
+                f"Splitting dataset into train and test sets with test size: {mixture_config.test_split_size}"
+            )
+            combined_dataset = combined_dataset.train_test_split(
+                test_size=mixture_config.test_split_size
+            )
+            return combined_dataset
+        return DatasetDict({"train": combined_dataset})
+    raise ValueError("No datasets were loaded from the mixture configuration")
diff --git a/src/aixpert/training/training/trl/scripts/vllm_serve.py b/src/aixpert/training/training/trl/scripts/vllm_serve.py
new file mode 100644
index 0000000..c39c6f0
--- /dev/null
+++ b/src/aixpert/training/training/trl/scripts/vllm_serve.py
@@ -0,0 +1,965 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import base64
+import logging
+import os
+from collections.abc import Sequence
+from contextlib import asynccontextmanager
+from dataclasses import dataclass, field
+from io import BytesIO
+from itertools import chain
+from multiprocessing import Pipe, Process
+from multiprocessing.connection import Connection
+
+import torch
+import torch.distributed.distributed_c10d as c10d
+from transformers import is_torch_xpu_available, is_vision_available
+
+from trl import TrlParser
+from trl.import_utils import (
+    is_fastapi_available,
+    is_pydantic_available,
+    is_uvicorn_available,
+    is_vllm_ascend_available,
+    is_vllm_available,
+)
+
+
+if is_fastapi_available():
+    from fastapi import FastAPI
+
+
+if is_pydantic_available():
+    from pydantic import BaseModel
+
+
+if is_uvicorn_available():
+    import uvicorn
+
+
+if is_vision_available():
+    from PIL import Image
+
+
+if is_vllm_available():
+    from vllm import LLM, SamplingParams
+    from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
+    from vllm.distributed.parallel_state import get_world_group
+    from vllm.distributed.utils import StatelessProcessGroup
+    from vllm.sampling_params import GuidedDecodingParams
+    from vllm.utils import get_open_port
+
+    if is_vllm_ascend_available():
+        from vllm_ascend.distributed.device_communicators.pyhccl import (
+            PyHcclCommunicator as PyNcclCommunicator,
+        )
+
+
+logger = logging.getLogger(__name__)
+
+# We use CUDA with multiprocessing, so we must use the 'spawn' start method. Otherwise, we will get the following
+# error: RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use
+# the 'spawn' start method
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
+
+class WeightSyncWorkerExtension:
+    """
+    A vLLM worker extension that enables weight synchronization between a client and multiple server workers.
+
+    This worker uses a `StatelessProcessGroup` to establish communication and a `PyNcclCommunicator` or
+    `ProcessGroupXCCL` to handle efficient GPU-based communication using NCCL. The primary purpose of this class is to
+    receive updated model weights from a client process and distribute them to all worker processes participating in
+    model inference.
+    """
+
+    # The following attributes are initialized when `init_communicator` method is called.
+    communicator = None  # Communicator for weight updates
+    client_rank = None  # Source rank for broadcasting updated weights
+
+    def init_communicator(
+        self, host: str, port: int, world_size: int, client_device_uuid: str
+    ) -> None:
+        """
+        Initializes the weight update communicator using a stateless process group.
+
+        This method creates a `StatelessProcessGroup` that allows external training processes to communicate with vLLM
+        workers without interfering with the global torch distributed group.
+
+        Args:
+            host (`str`):
+                Hostname or IP address of the master node.
+            port (`int`):
+                Port number to be used for communication.
+            world_size (`int`):
+                Total number of participating processes in the update group.
+            client_device_uuid (`str`):
+                UUID of the device of client main process. Used to assert that devices are different from vllm workers
+                devices.
+        """
+        if self.communicator is not None:
+            raise RuntimeError(
+                "Weight update group already initialized. Call close_communicator first."
+            )
+
+        # TODO: will remove after torch xpu 2.9 support uuid in get_device_properties
+        if torch.cuda.is_available() or (
+            is_torch_xpu_available()
+            and hasattr(torch.xpu.get_device_properties(self.device), "uuid")
+        ):
+            accelerator_module = torch.xpu if is_torch_xpu_available() else torch.cuda
+            if client_device_uuid == str(
+                accelerator_module.get_device_properties(self.device).uuid
+            ):
+                raise RuntimeError(
+                    f"Attempting to use the same CUDA device (UUID: {client_device_uuid}) for multiple distinct "
+                    "roles/ranks within the same communicator. This setup is unsupported and will likely lead to program "
+                    "hangs or incorrect behavior. Ensure that trainer is using different devices than vLLM server."
+                )
+        # Get the rank of the current worker in the global world group.
+        rank = get_world_group().rank
+
+        if is_torch_xpu_available():
+            store = torch.distributed.TCPStore(
+                host_name=host, port=port, world_size=world_size, is_master=(rank == 0)
+            )
+            prefixed_store = c10d.PrefixStore("client2server", store)
+            pg = c10d.ProcessGroupXCCL(
+                store=prefixed_store,
+                rank=rank,
+                size=world_size,
+            )
+            self.communicator = pg
+        else:
+            # Create a stateless process group to manage communication between training processes and vLLM workers.
+            # Initialize the NCCL-based communicator for weight synchronization.
+            pg = StatelessProcessGroup.create(
+                host=host, port=port, rank=rank, world_size=world_size
+            )
+            self.communicator = PyNcclCommunicator(pg, device=self.device)
+
+        # The client process that sends updated weights has the highest rank (world_size - 1).
+        self.client_rank = world_size - 1
+
+    def update_named_param(self, name: str, dtype: str, shape: Sequence[int]) -> None:
+        """
+        Receives updated weights from the client process and updates the named parameter in the model.
+
+        Args:
+            name (`str`):
+                Name of the weight tensor being updated.
+            dtype (`str`):
+                Data type of the weight tensor as a string (e.g., `"torch.float32"`).
+            shape (`Sequence[int]`):
+                Shape of the weight tensor.
+        """
+        if self.communicator is None:
+            raise RuntimeError(
+                "Communicator not initialized. Call `init_communicator` first."
+            )
+
+        dtype = getattr(torch, dtype.split(".")[-1])
+        # Allocate memory for the incoming weight tensor on the correct device.
+        weight = torch.empty(shape, dtype=dtype, device=self.device)
+
+        if is_torch_xpu_available():
+            # Use XCCL to broadcast the updated weights from the client (src) to all workers.
+            self.communicator.broadcast(weight, root=self.client_rank)
+            self.communicator.barrier()
+        else:
+            # Use NCCL to broadcast the updated weights from the client (src) to all workers.
+            self.communicator.broadcast(weight, src=self.client_rank)
+            self.communicator.group.barrier()
+
+        # Load the received weights into the model.
+        self.model_runner.model.load_weights(weights=[(name, weight)])
+
+    def close_communicator(self) -> None:
+        """
+        Closes the communicator when weight synchronization is no longer needed.
+
+        This method deletes the NCCL communicator to release associated resources.
+        """
+        if self.communicator is not None:
+            del self.communicator
+            self.communicator = None  # Ensure attribute is reset to None
+            self.client_rank = None  # Ensure attribute is reset to None
+
+
+@dataclass
+class ScriptArguments:
+    r"""
+    Arguments for the script.
+
+    Args:
+        model (`str`):
+            Model name or path to load the model from.
+        revision (`str`, *optional*):
+            Revision to use for the model. If not specified, the default branch will be used.
+        tensor_parallel_size (`int`, *optional*, defaults to `1`):
+            Number of tensor parallel workers to use.
+        data_parallel_size (`int`, *optional*, defaults to `1`):
+            Number of data parallel workers to use.
+        host (`str`, *optional*, defaults to `"0.0.0.0"`):
+            Host address to run the server on.
+        port (`int`, *optional*, defaults to `8000`):
+            Port to run the server on.
+        gpu_memory_utilization (`float`, *optional*, defaults to `0.9`):
+            Ratio (between 0 and 1) of GPU memory to reserve for the model weights, activations, and KV cache on the
+            device dedicated to generation powered by vLLM. Higher values will increase the KV cache size and thus
+            improve the model's throughput. However, if the value is too high, it may cause out-of-memory (OOM) errors
+            during initialization.
+        dtype (`str`, *optional*, defaults to `"auto"`):
+            Data type to use for vLLM generation. If set to `"auto"`, the data type will be automatically determined
+            based on the model configuration. Find the supported values in the vLLM documentation.
+        max_model_len (`int`, *optional*):
+            If set, the `max_model_len` to use for vLLM. This can be useful when running with reduced
+            `vllm_gpu_memory_utilization`, leading to a reduced KV cache size. If not set, vLLM will use the model
+            context size, which might be much larger than the KV cache, leading to inefficiencies.
+        enable_prefix_caching (`bool`, *optional*):
+            Whether to enable prefix caching in vLLM. If set to `True`, ensure that the model and the hardware support
+            this feature.
+        enforce_eager (`bool`, *optional*, defaults to `False`):
+            Whether to enforce eager execution. If set to `True`, we will disable CUDA graph and always execute the
+            model in eager mode. If `False` (default behavior), we will use CUDA graph and eager execution in hybrid.
+        vllm_model_impl (`str`, *optional*, defaults to `"vllm"`):
+            Model implementation to use for vLLM. Must be one of `"transformers"` or `"vllm"`. `"transformers"`: Use
+            the `transformers` backend for model implementation. `"vllm"`: Use the `vllm` library for model
+            implementation.
+        kv_cache_dtype (`str`, *optional*, defaults to `"auto"`):
+            Data type to use for KV cache. If set to `"auto"`, the dtype will default to the model data type.
+        trust_remote_code (`bool`, *optional*, defaults to `False`):
+            Whether to trust remote code when loading models. Set to `True` to allow executing code from model
+            repositories. This is required for some custom models but introduces security risks.
+        log_level (`str`, *optional*, defaults to `"info"`):
+            Log level for uvicorn. Possible choices: `"critical"`, `"error"`, `"warning"`, `"info"`, `"debug"`,
+            `"trace"`.
+    """
+
+    model: str = field(
+        metadata={"help": "Model name or path to load the model from."},
+    )
+    revision: str | None = field(
+        default=None,
+        metadata={
+            "help": "Revision to use for the model. If not specified, the default branch will be used."
+        },
+    )
+    tensor_parallel_size: int = field(
+        default=1,
+        metadata={"help": "Number of tensor parallel workers to use."},
+    )
+    data_parallel_size: int = field(
+        default=1,
+        metadata={"help": "Number of data parallel workers to use."},
+    )
+    host: str = field(
+        default="0.0.0.0",
+        metadata={"help": "Host address to run the server on."},
+    )
+    port: int = field(
+        default=8000,
+        metadata={"help": "Port to run the server on."},
+    )
+    gpu_memory_utilization: float = field(
+        default=0.9,
+        metadata={
+            "help": "Ratio (between 0 and 1) of GPU memory to reserve for the model weights, activations, and KV "
+            "cache on the device dedicated to generation powered by vLLM. Higher values will increase the KV cache "
+            "size and thus improve the model's throughput. However, if the value is too high, it may cause "
+            "out-of-memory (OOM) errors during initialization."
+        },
+    )
+    dtype: str = field(
+        default="auto",
+        metadata={
+            "help": "Data type to use for vLLM generation. If set to 'auto', the data type will be automatically "
+            "determined based on the model configuration. Find the supported values in the vLLM documentation."
+        },
+    )
+    max_model_len: int | None = field(
+        default=None,
+        metadata={
+            "help": "If set, the `max_model_len` to use for vLLM. This can be useful when running with reduced "
+            "`vllm_gpu_memory_utilization`, leading to a reduced KV cache size. If not set, vLLM will use the model "
+            "context size, which might be much larger than the KV cache, leading to inefficiencies."
+        },
+    )
+    enable_prefix_caching: bool | None = field(
+        default=None,
+        metadata={
+            "help": "Whether to enable prefix caching in vLLM. If set to `True`, ensure that the model and the "
+            "hardware support this feature."
+        },
+    )
+    enforce_eager: bool | None = field(
+        default=False,
+        metadata={
+            "help": "Whether to enforce eager execution. If set to `True`, we will disable CUDA graph and always "
+            "execute the model in eager mode. If `False` (default behavior), we will use CUDA graph and eager "
+            "execution in hybrid."
+        },
+    )
+    kv_cache_dtype: str = field(
+        default="auto",
+        metadata={
+            "help": "Data type to use for KV cache. If set to 'auto', the dtype will default to the model data type."
+        },
+    )
+    trust_remote_code: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to trust remote code when loading models. Set to True to allow executing code from model "
+            "repositories. This is required for some custom models but introduces security risks."
+        },
+    )
+    log_level: str = field(
+        default="info",
+        metadata={
+            "help": "Log level for uvicorn. Possible choices: 'critical', 'error', 'warning', 'info', 'debug', "
+            "'trace'."
+        },
+    )
+    vllm_model_impl: str = field(
+        default="vllm",
+        metadata={
+            "help": "Model implementation to use for vLLM. Must be one of `transformers` or `vllm`. `transformers`: "
+            "Use the `transformers` backend for model implementation. `vllm`: Use the `vllm` library for "
+            "model implementation."
+        },
+    )
+
+
+def llm_worker(
+    script_args: ScriptArguments,
+    data_parallel_rank: int,
+    master_port: int,
+    connection: Connection,
+) -> None:
+    # Set required environment variables for DP to work with vLLM
+    os.environ["VLLM_DP_RANK"] = str(data_parallel_rank)
+    os.environ["VLLM_DP_RANK_LOCAL"] = str(data_parallel_rank)
+    os.environ["VLLM_DP_SIZE"] = str(script_args.data_parallel_size)
+    os.environ["VLLM_DP_MASTER_PORT"] = str(master_port)
+
+    llm = LLM(
+        model=script_args.model,
+        revision=script_args.revision,
+        tensor_parallel_size=script_args.tensor_parallel_size,
+        gpu_memory_utilization=script_args.gpu_memory_utilization,
+        enforce_eager=script_args.enforce_eager,
+        dtype=script_args.dtype,
+        # Automatic Prefix Caching caches the KV cache of existing queries, so that a new query can
+        # directly reuse the KV cache if it shares the same prefix with one of the existing queries.
+        # This is particularly useful here because we generate completions from the same prompts.
+        enable_prefix_caching=script_args.enable_prefix_caching,
+        kv_cache_dtype=script_args.kv_cache_dtype,
+        max_model_len=script_args.max_model_len,
+        worker_extension_cls="trl.scripts.vllm_serve.WeightSyncWorkerExtension",
+        trust_remote_code=script_args.trust_remote_code,
+        model_impl=script_args.vllm_model_impl,
+    )
+
+    # Send ready signal to parent process
+    connection.send({"status": "ready"})
+
+    while True:
+        # Wait for commands from the parent process
+        try:
+            command = connection.recv()
+        except KeyboardInterrupt:
+            llm.collective_rpc(method="close_communicator")
+            break
+
+        # Handle commands
+        if command["type"] in ["call", "fire_and_forget"]:
+            method_name = command["method"]
+            args, kwargs = command.get("args", ()), command.get("kwargs", {})
+            method = getattr(llm, method_name)
+            result = method(*args, **kwargs)
+            if command["type"] == "call":
+                connection.send(result)
+        elif command["type"] == "shutdown":
+            break
+
+
+def chunk_list(lst: list, n: int) -> list[list]:
+    """
+    Split list `lst` into `n` evenly distributed sublists.
+
+    Example:
+    ```python
+    >>> chunk_list([1, 2, 3, 4, 5, 6], 2)
+    [[1, 2, 3], [4, 5, 6]]
+
+    >>> chunk_list([1, 2, 3, 4, 5, 6], 4)
+    [[1, 2], [3, 4], [5], [6]]
+
+    >>> chunk_list([1, 2, 3, 4, 5, 6], 8)
+    [[1], [2], [3], [4], [5], [6], [], []]
+    ```
+    """
+    k, r = divmod(len(lst), n)
+    return [lst[i * k + min(i, r) : (i + 1) * k + min(i + 1, r)] for i in range(n)]
+
+
+def sanitize_logprob(logprob):
+    import math
+
+    value = logprob.logprob
+    if math.isnan(value):
+        logger.warning(
+            f"Generated NaN logprob, token logprob '{logprob}' will be ignored"
+        )
+        return None
+
+    return value
+
+
+def main(script_args: ScriptArguments):
+    if not is_fastapi_available():
+        raise ImportError(
+            "FastAPI is required to run the vLLM serve script. Please install it using `pip install fastapi`."
+        )
+
+    if not is_pydantic_available():
+        raise ImportError(
+            "Pydantic is required to run the vLLM serve script. Please install it using `pip install pydantic`."
+        )
+
+    if not is_uvicorn_available():
+        raise ImportError(
+            "Uvicorn is required to run the vLLM serve script. Please install it using `pip install uvicorn`."
+        )
+
+    if not is_vllm_available():
+        raise ImportError(
+            "vLLM is required to run the vLLM serve script. Please install it using `pip install vllm`."
+        )
+
+    # Spawn dp workers, and setup pipes for communication
+    master_port = get_open_port()
+    connections = []
+    processes = []
+    for data_parallel_rank in range(script_args.data_parallel_size):
+        parent_connection, child_connection = Pipe()
+        process = Process(
+            target=llm_worker,
+            args=(script_args, data_parallel_rank, master_port, child_connection),
+        )
+        process.start()
+        connections.append(parent_connection)
+        processes.append(process)
+
+    @asynccontextmanager
+    async def lifespan(app: FastAPI):
+        # Wait for all workers to send "ready"
+        ready_connections = set()
+        while len(ready_connections) < script_args.data_parallel_size:
+            for connection in connections:
+                msg = connection.recv()
+                if isinstance(msg, dict) and msg.get("status") == "ready":
+                    ready_connections.add(connection)
+
+        yield
+
+        # Wait for processes to terminate
+        for process in processes:
+            process.join(timeout=10)  # Wait for 10 seconds for the process to terminate
+            if process.is_alive():
+                logger.warning(
+                    f"Process {process} is still alive after 10 seconds, attempting to terminate..."
+                )
+                process.terminate()
+                process.join()  # ensure process termination after calling terminate()
+
+    app = FastAPI(lifespan=lifespan)
+
+    # Define the endpoints for the model server
+    @app.get("/health/")
+    async def health():
+        """
+        Health check endpoint to verify that the server is running.
+        """
+        return {"status": "ok"}
+
+    @app.get("/get_world_size/")
+    async def get_world_size():
+        """
+        Retrieves the world size of the LLM engine, which is `tensor_parallel_size * data_parallel_size`.
+
+        Returns
+        -------
+            `dict`:
+                A dictionary containing the world size.
+
+        Example response:
+        ```json
+        {"world_size": 8}
+        ```
+        """
+        return {
+            "world_size": script_args.tensor_parallel_size
+            * script_args.data_parallel_size
+        }
+
+    class GenerateRequest(BaseModel):
+        prompts: list[str]
+        images: list[str] | None = None
+        n: int = 1
+        repetition_penalty: float = 1.0
+        temperature: float = 1.0
+        top_p: float = 1.0
+        top_k: int = -1
+        min_p: float = 0.0
+        max_tokens: int = 16
+        truncate_prompt_tokens: int | None = None
+        guided_decoding_regex: str | None = None
+        generation_kwargs: dict = field(default_factory=dict)
+
+    class GenerateResponse(BaseModel):
+        prompt_ids: list[list[int]]
+        completion_ids: list[list[int]]
+        logprobs: list[list[float]]
+
+    @app.post("/generate/", response_model=GenerateResponse)
+    async def generate(request: GenerateRequest):
+        """
+        Generates completions for the provided prompts.
+
+        Args:
+            request (`GenerateRequest`):
+                - `prompts` (list of `str`): A list of prompts (text strings) for the model to generate completions.
+                - `images` (list of `str`, *optional*, default to `None`): A list of base64 encoded images to process
+                  along with prompts.
+                - `n` (`int`, *optional*, defaults to `1`): Number of completions to generate for each prompt.
+                - `repetition_penalty` (`float`, *optional*, defaults to `1.0`): Repetition penalty to apply during
+                  generation.
+                - `temperature` (`float`, *optional*, defaults to `1.0`): Temperature for sampling. Higher values lead
+                  to more random outputs.
+                - `top_p` (`float`, *optional*, defaults to `1.0`): Top-p (nucleus) sampling parameter. It controls the
+                  diversity of the generated text.
+                - `top_k` (`int`, *optional*, defaults to `-1`): Top-k sampling parameter. If set to `-1`, it disables
+                  top-k sampling.
+                - `min_p` (`float`, *optional*, defaults to `0.0`): Minimum probability threshold for sampling.
+                - `max_tokens` (`int`, *optional*, defaults to `16`): Maximum number of tokens to generate for each
+                  completion.
+                - `truncate_prompt_tokens` (`int`, *optional*): If set to `-1`, will use the truncation size supported
+                  by the model. If set to an integer k, will use only the last k tokens from the prompt (i.e., left
+                  truncation). If set to `None`, truncation is disabled.
+                - `guided_decoding_regex` (`str`, *optional*): A regex pattern for guided decoding. If provided, the
+                  model will only generate tokens that match this regex pattern.
+                - `generation_kwargs` (`dict`, *optional*): Additional generation parameters to pass to the vLLM
+                  `SamplingParams`. This can include parameters like `seed`, `frequency_penalty`, etc. If it contains
+                  keys that conflict with the other parameters, they will override them.
+
+        Returns
+        -------
+            `GenerateResponse`:
+                - `prompt_ids` (list of list of `int`): A list of lists of token IDs for each input prompt.
+                - `completion_ids` (list of list of `int`): A list of lists of token IDs for each generated completion.
+                - `logprobs` (list of list of `float`): A list of lists of log probabilities for each token in the
+                  generated completions.
+
+        Example request:
+        ```json
+        {"prompts": ["Hello world", "What is AI?"]}
+        ```
+
+        Example response:
+        ```json
+        {
+          "prompt_ids": [[101, 102], [201, 202]],
+          "completion_ids": [[103, 104, 105], [203, 204, 205]],
+          "logprobs": [[-0.1, -0.2, -0.3], [-0.4, -0.5, -0.6]]
+        }
+        ```
+        """
+        request.images = request.images or [None] * len(request.prompts)
+
+        prompts = []
+        for prompt, image in zip(request.prompts, request.images, strict=True):
+            row = {"prompt": prompt}
+            if image is not None:
+                row["multi_modal_data"] = {
+                    "image": Image.open(BytesIO(base64.b64decode(image)))
+                }
+            prompts.append(row)
+
+        # Guided decoding, if enabled
+        if request.guided_decoding_regex is not None:
+            guided_decoding = GuidedDecodingParams(regex=request.guided_decoding_regex)
+        else:
+            guided_decoding = None
+
+        generation_kwargs = {
+            "n": request.n,
+            "repetition_penalty": request.repetition_penalty,
+            "temperature": request.temperature,
+            "top_p": request.top_p,
+            "top_k": request.top_k,
+            "min_p": request.min_p,
+            "max_tokens": request.max_tokens,
+            "truncate_prompt_tokens": request.truncate_prompt_tokens,
+            "guided_decoding": guided_decoding,
+            "logprobs": 0,  # enable returning log probabilities; 0 means for the sampled tokens only
+        }
+        generation_kwargs.update(request.generation_kwargs)
+        sampling_params = SamplingParams(**generation_kwargs)
+
+        # Evenly distribute prompts across DP ranks
+        chunked_prompts = chunk_list(prompts, script_args.data_parallel_size)
+
+        # Send the prompts to each worker
+        for connection, prompts in zip(connections, chunked_prompts, strict=True):
+            # When the number of prompts is less than data_parallel_size, some workers will receive empty prompts.
+            # However, vLLM requires that we always send at least one prompt. So we send a placeholder prompt to comply
+            # with vLLM's requirement, and we later ignore the result.
+            if not prompts:
+                prompts = ["<placeholder>"]
+            kwargs = {"prompts": prompts, "sampling_params": sampling_params}
+            connection.send({"type": "call", "method": "generate", "kwargs": kwargs})
+
+        # Receive results
+        all_outputs = [connection.recv() for connection in connections]
+
+        # Handle empty prompts (see above)
+        all_outputs = [
+            output
+            for output, prompts in zip(all_outputs, chunked_prompts, strict=True)
+            if prompts
+        ]
+
+        # Flatten and combine all results
+        all_outputs = list(
+            chain.from_iterable(all_outputs)
+        )  # from list of list to single list
+        prompt_ids = [output.prompt_token_ids for output in all_outputs]
+        completion_ids = [
+            list(output.token_ids)
+            for outputs in all_outputs
+            for output in outputs.outputs
+        ]
+        logprobs: list[list[float]] = [
+            [
+                sanitize_logprob(next(iter(logprob.values())))
+                for logprob in output.logprobs
+            ]
+            for outputs in all_outputs
+            for output in outputs.outputs
+        ]
+        return {
+            "prompt_ids": prompt_ids,
+            "completion_ids": completion_ids,
+            "logprobs": logprobs,
+        }
+
+    class ChatRequest(BaseModel):
+        messages: list[list[dict]]
+        n: int = 1
+        repetition_penalty: float = 1.0
+        temperature: float = 1.0
+        top_p: float = 1.0
+        top_k: int = -1
+        min_p: float = 0.0
+        max_tokens: int = 16
+        truncate_prompt_tokens: int | None = None
+        guided_decoding_regex: str | None = None
+        generation_kwargs: dict = field(default_factory=dict)
+        chat_template_kwargs: dict = field(default_factory=dict)
+
+    class ChatResponse(BaseModel):
+        prompt_ids: list[list[int]]
+        completion_ids: list[list[int]]
+        logprobs: list[list[float]]
+
+    @app.post("/chat/", response_model=ChatResponse)
+    async def chat(request: ChatRequest):
+        """
+        Generates completions for the provided chat messages.
+
+        Args:
+            request (`ChatRequest`):
+                - `messages` (list of `dict`): A list of messages (dicts with "role" and "content" keys) for the model
+                  to generate completions.
+                - `n` (`int`, *optional*, defaults to `1`): Number of completions to generate for each prompt.
+                - `repetition_penalty` (`float`, *optional*, defaults to `1.0`): Repetition penalty to apply during
+                  generation.
+                - `temperature` (`float`, *optional*, defaults to `1.0`): Temperature for sampling. Higher values lead
+                  to more random outputs.
+                - `top_p` (`float`, *optional*, defaults to `1.0`): Top-p (nucleus) sampling parameter. It controls the
+                  diversity of the generated text.
+                - `top_k` (`int`, *optional*, defaults to `-1`): Top-k sampling parameter. If set to `-1`, it disables
+                  top-k sampling.
+                - `min_p` (`float`, *optional*, defaults to `0.0`): Minimum probability threshold for sampling.
+                - `max_tokens` (`int`, *optional*, defaults to `16`): Maximum number of tokens to generate for each
+                  completion.
+                - `truncate_prompt_tokens` (`int`, *optional*): If set to `-1`, will use the truncation size supported
+                  by the model. If set to an integer k, will use only the last k tokens from the prompt (i.e., left
+                  truncation). If set to `None`, truncation is disabled.
+                - `guided_decoding_regex` (`str`, *optional*): A regex pattern for guided decoding. If provided, the
+                  model will only generate tokens that match this regex pattern.
+                - `generation_kwargs` (`dict`, *optional*): Additional generation parameters to pass to the vLLM
+                  `SamplingParams`. This can include parameters like `seed`, `frequency_penalty`, etc. If it contains
+                  keys that conflict with the other parameters, they will override them.
+                - `chat_template_kwargs` (`dict`, *optional*): Additional keyword arguments to pass to the chat
+                  template.
+
+        Returns
+        -------
+            `ChatResponse`:
+                - `prompt_ids` (list of list of `int`): A list of lists of token IDs for each input prompt.
+                - `completion_ids` (list of list of `int`): A list of lists of token IDs for each generated completion.
+                - `logprobs` (list of list of `float`): A list of lists of log probabilities for each token in the
+                  generated completions.
+
+        Example request:
+        ```bash
+        curl -X POST 'http://0.0.0.0:8000/chat/' \
+          -H 'Content-Type: application/json' \
+          -d '{"messages": [[{ "role": "user", "content": "Hello!" }]]}'
+        ```
+
+        Example response:
+        ```json
+        {
+            "prompt_ids": [[151644, 872, 198, 9707, 0, 151645, 198, 151644, 77091, 198]],
+            "completion_ids":[[151667, 198, 32313, 11, 279, 1196, 1101, 1053, 330, 9707, 8958, 773, 358, 1184, 311, 5889]],
+            "logprobs": [[-0.00029404606902971864, -3.576278118089249e-07, -0.09024181962013245, -6.389413465512916e-05, -0.038671817630529404, -0.00013314791431184858, -0.5868351459503174, -0.09682723134756088, -0.06609706580638885, -0.00023803261865396053, -0.02242819033563137, -0.8185162544250488, -0.04954879730939865, -0.3169460594654083, -4.887569048150908e-06, -0.006023705471307039]]
+        }
+        ```
+        """
+        # Convert PIL images to base64 strings
+        for message_list in request.messages:
+            for message in message_list:
+                if isinstance(message["content"], list):
+                    for part in message["content"]:
+                        if part["type"] == "image_pil":
+                            part["image_pil"] = Image.open(
+                                BytesIO(base64.b64decode(part["image_pil"]))
+                            )
+
+        # Guided decoding, if enabled
+        if request.guided_decoding_regex is not None:
+            guided_decoding = GuidedDecodingParams(regex=request.guided_decoding_regex)
+        else:
+            guided_decoding = None
+
+        generation_kwargs = {
+            "n": request.n,
+            "repetition_penalty": request.repetition_penalty,
+            "temperature": request.temperature,
+            "top_p": request.top_p,
+            "top_k": request.top_k,
+            "min_p": request.min_p,
+            "max_tokens": request.max_tokens,
+            "truncate_prompt_tokens": request.truncate_prompt_tokens,
+            "guided_decoding": guided_decoding,
+            "logprobs": 0,  # enable returning log probabilities; 0 means for the sampled tokens only
+        }
+        generation_kwargs.update(request.generation_kwargs)
+        sampling_params = SamplingParams(**generation_kwargs)
+
+        # Evenly distribute prompts across DP ranks
+        chunked_messages = chunk_list(request.messages, script_args.data_parallel_size)
+
+        # Send the messages to each worker
+        for connection, messages in zip(connections, chunked_messages, strict=True):
+            # When the number of messages is less than data_parallel_size, some workers will receive empty messages.
+            # However, vLLM requires that we always send at least one prompt. So we send a placeholder prompt to comply
+            # with vLLM's requirement, and we later ignore the result.
+            if not messages:
+                messages = [[{"role": "user", "content": "<placeholder>"}]]
+            kwargs = {
+                "messages": messages,
+                "sampling_params": sampling_params,
+                "chat_template_kwargs": request.chat_template_kwargs,
+            }
+            connection.send({"type": "call", "method": "chat", "kwargs": kwargs})
+
+        # Receive results
+        all_outputs = [connection.recv() for connection in connections]
+
+        # Handle empty prompts (see above)
+        all_outputs = [
+            output
+            for output, prompts in zip(all_outputs, chunked_messages, strict=True)
+            if prompts
+        ]
+
+        # Flatten and combine all results
+        all_outputs = list(
+            chain.from_iterable(all_outputs)
+        )  # from list of list to single list
+        prompt_ids = [output.prompt_token_ids for output in all_outputs]
+        completion_ids = [
+            list(output.token_ids)
+            for outputs in all_outputs
+            for output in outputs.outputs
+        ]
+        logprobs: list[list[float]] = [
+            [
+                sanitize_logprob(next(iter(logprob.values())))
+                for logprob in output.logprobs
+            ]
+            for outputs in all_outputs
+            for output in outputs.outputs
+        ]
+        return {
+            "prompt_ids": prompt_ids,
+            "completion_ids": completion_ids,
+            "logprobs": logprobs,
+        }
+
+    class InitCommunicatorRequest(BaseModel):
+        host: str
+        port: int
+        world_size: int
+        client_device_uuid: str
+
+    @app.post("/init_communicator/")
+    async def init_communicator(request: InitCommunicatorRequest):
+        """
+        Initializes the communicator for synchronizing model weights between a client and multiple server workers.
+
+        Args:
+            request (`InitCommunicatorRequest`):
+                - `host` (`str`): Hostname or IP address of the master node.
+                - `port` (`int`): Port number to be used for communication.
+                - `world_size` (`int`): Total number of participating processes in the group.
+                - `client_device_uuid` (`str`): UUID of the device of client main process. Used to assert that devices
+                  are different from vLLM workers devices.
+        """
+        world_size = (
+            script_args.tensor_parallel_size * script_args.data_parallel_size + 1
+        )
+
+        # The function init_communicator is called this way: init_communicator(host, port, world_size)
+        # So with collective_rpc we need to call it this way:
+        # llm.collective_rpc(method="init_communicator", args=(host, port, world_size))
+        kwargs = {
+            "method": "init_communicator",
+            "args": (
+                request.host,
+                request.port,
+                world_size,
+                request.client_device_uuid,
+            ),
+        }
+        for connection in connections:
+            connection.send(
+                {
+                    "type": "fire_and_forget",
+                    "method": "collective_rpc",
+                    "kwargs": kwargs,
+                }
+            )
+
+        return {"message": "Request received, initializing communicator"}
+
+    class UpdateWeightsRequest(BaseModel):
+        name: str
+        dtype: str
+        shape: list[int]
+
+    @app.post("/update_named_param/")
+    async def update_named_param(request: UpdateWeightsRequest):
+        """
+        Updates the model weights with the provided tensor.
+
+        Once this endpoint is called, the client process should broadcast the updated weights to all server workers.
+
+        Args:
+            request (`UpdateWeightsRequest`):
+                - `name` (`str`): Name of the weight tensor being updated.
+                - `dtype` (`str`): Data type of the weight tensor (e.g., `"torch.float32"`).
+                - `shape` (list of `int`): Shape of the weight
+
+        """
+        # The function update_named_param is called this way: update_named_param("name", "torch.float32", (10, 10))
+        # So with collective_rpc we need to call it this way:
+        # llm.collective_rpc("update_named_param", args=("name", "torch.float32", (10, 10)))
+        kwargs = {
+            "method": "update_named_param",
+            "args": (request.name, request.dtype, tuple(request.shape)),
+        }
+        for connection in connections:
+            connection.send(
+                {
+                    "type": "fire_and_forget",
+                    "method": "collective_rpc",
+                    "kwargs": kwargs,
+                }
+            )
+
+        return {"message": "Request received, updating named parameter"}
+
+    @app.post("/reset_prefix_cache/")
+    async def reset_prefix_cache():
+        """
+        Resets the prefix cache for the model.
+        """
+        for connection in connections:
+            connection.send({"type": "call", "method": "reset_prefix_cache"})
+        # Wait for and collect all results
+        all_outputs = [connection.recv() for connection in connections]
+        success = all(output for output in all_outputs)
+        return {
+            "message": "Request received, resetting prefix cache status: "
+            + str(success)
+        }
+
+    @app.post("/close_communicator/")
+    async def close_communicator():
+        """
+        Closes the weight update group and cleans up associated resources.
+        """
+        kwargs = {"method": "close_communicator"}
+        for connection in connections:
+            connection.send(
+                {
+                    "type": "fire_and_forget",
+                    "method": "collective_rpc",
+                    "kwargs": kwargs,
+                }
+            )
+        return {"message": "Request received, closing communicator"}
+
+    # Start the server
+    uvicorn.run(
+        app,
+        host=script_args.host,
+        port=script_args.port,
+        log_level=script_args.log_level,
+    )
+
+
+def make_parser(subparsers: argparse._SubParsersAction | None = None):
+    if subparsers is not None:
+        parser = subparsers.add_parser(
+            "vllm-serve",
+            help="Run the vLLM serve script",
+            dataclass_types=ScriptArguments,
+        )
+    else:
+        parser = TrlParser(ScriptArguments)
+    return parser
+
+
+if __name__ == "__main__":
+    parser = make_parser()
+    (script_args,) = parser.parse_args_and_config()
+    main(script_args)
diff --git a/src/aixpert/training/training/trl/templates/lm_model_card.md b/src/aixpert/training/training/trl/templates/lm_model_card.md
new file mode 100644
index 0000000..c52842d
--- /dev/null
+++ b/src/aixpert/training/training/trl/templates/lm_model_card.md
@@ -0,0 +1,55 @@
+---
+{{ card_data }}
+---
+
+# Model Card for {{ model_name }}
+
+This model is a fine-tuned version of [{{ base_model }}](https://huggingface.co/{{ base_model }}){% if dataset_name %} on the [{{ dataset_name }}](https://huggingface.co/datasets/{{ dataset_name }}) dataset{% endif %}.
+It has been trained using [TRL](https://github.com/huggingface/trl).
+
+## Quick start
+
+```python
+from transformers import pipeline
+
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="{{ hub_model_id }}", device="cuda")
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+
+## Training procedure
+
+{% if wandb_url %}[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>]({{ wandb_url }}){% endif %}
+{% if comet_url %}[<img src="https://raw.githubusercontent.com/comet-ml/comet-examples/master/logo/comet_badge.png" alt="Visualize in Comet" width="135" height="20"/>]({{ comet_url }}){% endif %}
+
+This model was trained with {{ trainer_name }}{% if paper_id %}, a method introduced in [{{ paper_title }}](https://huggingface.co/papers/{{ paper_id }}){% endif %}.
+
+### Framework versions
+
+- TRL: {{ trl_version }}
+- Transformers: {{ transformers_version }}
+- Pytorch: {{ pytorch_version }}
+- Datasets: {{ datasets_version }}
+- Tokenizers: {{ tokenizers_version }}
+
+## Citations
+
+{% if trainer_citation %}Cite {{ trainer_name }} as:
+
+```bibtex
+{{ trainer_citation }}
+```{% endif %}
+
+Cite TRL as:
+
+```bibtex
+{% raw %}@misc{vonwerra2022trl,
+	title        = {{TRL: Transformer Reinforcement Learning}},
+	author       = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
+	year         = 2020,
+	journal      = {GitHub repository},
+	publisher    = {GitHub},
+	howpublished = {\url{https://github.com/huggingface/trl}}
+}{% endraw %}
+```
diff --git a/src/aixpert/training/training/trl/templates/rm_model_card.md b/src/aixpert/training/training/trl/templates/rm_model_card.md
new file mode 100644
index 0000000..deef8dd
--- /dev/null
+++ b/src/aixpert/training/training/trl/templates/rm_model_card.md
@@ -0,0 +1,55 @@
+---
+{{ card_data }}
+---
+
+# Model Card for {{ model_name }}
+
+This model is a fine-tuned version of [{{ base_model }}](https://huggingface.co/{{ base_model }}){% if dataset_name %} on the [{{ dataset_name }}](https://huggingface.co/datasets/{{ dataset_name }}) dataset{% endif %}.
+It has been trained using [TRL](https://github.com/huggingface/trl).
+
+## Quick start
+
+```python
+from transformers import pipeline
+
+text = "The capital of France is Paris."
+rewarder = pipeline(model="{{ hub_model_id }}", device="cuda")
+output = rewarder(text)[0]
+print(output["score"])
+```
+
+## Training procedure
+
+{% if wandb_url %}[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>]({{ wandb_url }}){% endif %}
+{% if comet_url %}[<img src="https://raw.githubusercontent.com/comet-ml/comet-examples/master/logo/comet_badge.png" alt="Visualize in Comet" width="135" height="20"/>]({{ comet_url }}){% endif %}
+
+This model was trained with {{ trainer_name }}{% if paper_id %}, a method introduced in [{{ paper_title }}](https://huggingface.co/papers/{{ paper_id }}){% endif %}.
+
+### Framework versions
+
+- TRL: {{ trl_version }}
+- Transformers: {{ transformers_version }}
+- Pytorch: {{ pytorch_version }}
+- Datasets: {{ datasets_version }}
+- Tokenizers: {{ tokenizers_version }}
+
+## Citations
+
+{% if trainer_citation %}Cite {{ trainer_name }} as:
+
+```bibtex
+{{ trainer_citation }}
+```{% endif %}
+
+Cite TRL as:
+
+```bibtex
+{% raw %}@misc{vonwerra2022trl,
+	title        = {{TRL: Transformer Reinforcement Learning}},
+	author       = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
+	year         = 2020,
+	journal      = {GitHub repository},
+	publisher    = {GitHub},
+	howpublished = {\url{https://github.com/huggingface/trl}}
+}{% endraw %}
+```
diff --git a/src/aixpert/training/training/trl/trainer/__init__.py b/src/aixpert/training/training/trl/trainer/__init__.py
new file mode 100644
index 0000000..914493b
--- /dev/null
+++ b/src/aixpert/training/training/trl/trainer/__init__.py
@@ -0,0 +1,143 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING
+
+from ..import_utils import _LazyModule
+
+
+_import_structure = {
+    "bco_config": ["BCOConfig"],
+    "bco_trainer": ["BCOTrainer"],
+    "callbacks": [
+        "BEMACallback",
+        "LogCompletionsCallback",
+        "MergeModelCallback",
+        "RichProgressCallback",
+        "SyncRefModelCallback",
+        "WeaveCallback",
+        "WinRateCallback",
+    ],
+    "cpo_config": ["CPOConfig"],
+    "cpo_trainer": ["CPOTrainer"],
+    "dpo_config": ["DPOConfig", "FDivergenceConstants", "FDivergenceType"],
+    "dpo_trainer": ["DPOTrainer"],
+    "gkd_config": ["GKDConfig"],
+    "gkd_trainer": ["GKDTrainer"],
+    "grpo_config": ["GRPOConfig"],
+    "grpo_trainer": ["GRPOTrainer"],
+    "judges": [
+        "AllTrueJudge",
+        "BaseBinaryJudge",
+        "BaseJudge",
+        "BasePairwiseJudge",
+        "BaseRankJudge",
+        "HfPairwiseJudge",
+        "OpenAIPairwiseJudge",
+        "PairRMJudge",
+    ],
+    "kto_config": ["KTOConfig"],
+    "kto_trainer": ["KTOTrainer"],
+    "model_config": ["ModelConfig"],
+    "nash_md_config": ["NashMDConfig"],
+    "nash_md_trainer": ["NashMDTrainer"],
+    "online_dpo_config": ["OnlineDPOConfig"],
+    "online_dpo_trainer": ["OnlineDPOTrainer"],
+    "orpo_config": ["ORPOConfig"],
+    "orpo_trainer": ["ORPOTrainer"],
+    "ppo_config": ["PPOConfig"],
+    "ppo_trainer": ["PPOTrainer"],
+    "prm_config": ["PRMConfig"],
+    "prm_trainer": ["PRMTrainer"],
+    "reward_config": ["RewardConfig"],
+    "reward_trainer": ["RewardTrainer"],
+    "rloo_config": ["RLOOConfig"],
+    "rloo_trainer": ["RLOOTrainer"],
+    "sft_config": ["SFTConfig"],
+    "sft_trainer": ["SFTTrainer"],
+    "utils": [
+        "RunningMoments",
+        "compute_accuracy",
+        "disable_dropout_in_model",
+        "empty_cache",
+        "peft_module_casting_to_bf16",
+    ],
+    "xpo_config": ["XPOConfig"],
+    "xpo_trainer": ["XPOTrainer"],
+}
+
+if TYPE_CHECKING:
+    from .bco_config import BCOConfig
+    from .bco_trainer import BCOTrainer
+    from .callbacks import (
+        BEMACallback,
+        LogCompletionsCallback,
+        MergeModelCallback,
+        RichProgressCallback,
+        SyncRefModelCallback,
+        WeaveCallback,
+        WinRateCallback,
+    )
+    from .cpo_config import CPOConfig
+    from .cpo_trainer import CPOTrainer
+    from .dpo_config import DPOConfig, FDivergenceConstants, FDivergenceType
+    from .dpo_trainer import DPOTrainer
+    from .gkd_config import GKDConfig
+    from .gkd_trainer import GKDTrainer
+    from .grpo_config import GRPOConfig
+    from .grpo_trainer import GRPOTrainer
+    from .judges import (
+        AllTrueJudge,
+        BaseBinaryJudge,
+        BaseJudge,
+        BasePairwiseJudge,
+        BaseRankJudge,
+        HfPairwiseJudge,
+        OpenAIPairwiseJudge,
+        PairRMJudge,
+    )
+    from .kto_config import KTOConfig
+    from .kto_trainer import KTOTrainer
+    from .model_config import ModelConfig
+    from .nash_md_config import NashMDConfig
+    from .nash_md_trainer import NashMDTrainer
+    from .online_dpo_config import OnlineDPOConfig
+    from .online_dpo_trainer import OnlineDPOTrainer
+    from .orpo_config import ORPOConfig
+    from .orpo_trainer import ORPOTrainer
+    from .ppo_config import PPOConfig
+    from .ppo_trainer import PPOTrainer
+    from .prm_config import PRMConfig
+    from .prm_trainer import PRMTrainer
+    from .reward_config import RewardConfig
+    from .reward_trainer import RewardTrainer
+    from .rloo_config import RLOOConfig
+    from .rloo_trainer import RLOOTrainer
+    from .sft_config import SFTConfig
+    from .sft_trainer import SFTTrainer
+    from .utils import (
+        RunningMoments,
+        compute_accuracy,
+        disable_dropout_in_model,
+        empty_cache,
+        peft_module_casting_to_bf16,
+    )
+    from .xpo_config import XPOConfig
+    from .xpo_trainer import XPOTrainer
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__, globals()["__file__"], _import_structure, module_spec=__spec__
+    )
diff --git a/src/aixpert/training/training/trl/trainer/base_trainer.py b/src/aixpert/training/training/trl/trainer/base_trainer.py
new file mode 100644
index 0000000..ea60e69
--- /dev/null
+++ b/src/aixpert/training/training/trl/trainer/base_trainer.py
@@ -0,0 +1,88 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+from transformers import Trainer, is_wandb_available
+
+from .utils import generate_model_card, get_comet_experiment_url, get_config_model_id
+
+
+if is_wandb_available():
+    import wandb
+
+
+class BaseTrainer(Trainer):
+    _tag_names = []
+    _name = "Base"
+    _paper = {}
+    _template_file = None
+
+    def create_model_card(
+        self,
+        model_name: str | None = None,
+        dataset_name: str | None = None,
+        tags: str | list[str] | None = None,
+    ):
+        """
+        Creates a draft of a model card using the information available to the `Trainer`.
+
+        Args:
+            model_name (`str`, *optional*):
+                Name of the model.
+            dataset_name (`str`, *optional*):
+                Name of the dataset used for training.
+            tags (`str`, `list[str]`, *optional*):
+                Tags to be associated with the model card.
+        """
+        if not self.is_world_process_zero():
+            return
+
+        model_name_or_path = get_config_model_id(self.model.config)
+        if model_name_or_path and not os.path.isdir(model_name_or_path):
+            base_model = model_name_or_path
+        else:
+            base_model = None
+
+        # Normalize tags
+        if tags is None:
+            tags = set()
+        elif isinstance(tags, str):
+            tags = {tags}
+        else:
+            tags = set(tags)
+        if hasattr(self.model.config, "unsloth_version"):
+            tags.add("unsloth")
+        if "JOB_ID" in os.environ:
+            tags.add("hf_jobs")
+        tags.update(self._tag_names)
+        tags = list(tags)
+
+        model_card = generate_model_card(
+            base_model=base_model,
+            model_name=model_name,
+            hub_model_id=self.hub_model_id,
+            dataset_name=dataset_name,
+            tags=tags,
+            wandb_url=wandb.run.url
+            if is_wandb_available() and wandb.run is not None
+            else None,
+            comet_url=get_comet_experiment_url(),
+            trainer_name=self._name,
+            trainer_citation=self._paper.get("citation"),
+            template_file=self._template_file,
+            paper_title=self._paper.get("title"),
+            paper_id=self._paper.get("id"),
+        )
+        model_card.save(os.path.join(self.args.output_dir, "README.md"))
diff --git a/src/aixpert/training/training/trl/trainer/bco_config.py b/src/aixpert/training/training/trl/trainer/bco_config.py
new file mode 100644
index 0000000..6e22dda
--- /dev/null
+++ b/src/aixpert/training/training/trl/trainer/bco_config.py
@@ -0,0 +1,29 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+from dataclasses import dataclass
+
+from ..experimental.bco import BCOConfig as _BCOConfig
+
+
+@dataclass
+class BCOConfig(_BCOConfig):
+    def __post_init__(self):
+        warnings.warn(
+            "The `BCOConfig` is now located in `trl.experimental`. Please update your imports to "
+            "`from trl.experimental.bco import BCOConfig`. The current import path will be removed and no longer "
+            "supported in TRL 0.29. For more information, see https://github.com/huggingface/trl/issues/4223."
+        )
+        super().__post_init__()
diff --git a/src/aixpert/training/training/trl/trainer/bco_trainer.py b/src/aixpert/training/training/trl/trainer/bco_trainer.py
new file mode 100644
index 0000000..c98fc32
--- /dev/null
+++ b/src/aixpert/training/training/trl/trainer/bco_trainer.py
@@ -0,0 +1,29 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+from dataclasses import dataclass
+
+from ..experimental.bco import BCOTrainer as _BCOTrainer
+
+
+@dataclass
+class BCOTrainer(_BCOTrainer):
+    def __init__(self, *args, **kwargs):
+        warnings.warn(
+            "The `BCOTrainer` is now located in `trl.experimental`. Please update your imports to "
+            "`from trl.experimental.bco import BCOTrainer`. The current import path will be removed and no longer "
+            "supported in TRL 0.29. For more information, see https://github.com/huggingface/trl/issues/4223."
+        )
+        super().__init__(*args, **kwargs)
diff --git a/src/aixpert/training/training/trl/trainer/callbacks.py b/src/aixpert/training/training/trl/trainer/callbacks.py
new file mode 100644
index 0000000..2151f01
--- /dev/null
+++ b/src/aixpert/training/training/trl/trainer/callbacks.py
@@ -0,0 +1,1146 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+
+import pandas as pd
+import torch
+from accelerate import Accelerator
+from accelerate.state import AcceleratorState
+from accelerate.utils import gather_object, is_wandb_available
+from transformers import (
+    GenerationConfig,
+    PreTrainedModel,
+    PreTrainedTokenizerBase,
+    Trainer,
+    TrainerCallback,
+    TrainerControl,
+    TrainerState,
+    TrainingArguments,
+)
+from transformers.trainer_utils import has_length
+from transformers.utils import is_rich_available
+
+from ..data_utils import maybe_apply_chat_template
+from ..import_utils import is_mergekit_available, is_weave_available
+from ..mergekit_utils import MergeConfig, merge_models, upload_model_to_hf
+from ..models.utils import unwrap_model_for_generation
+from .utils import get_config_model_id, log_table_to_comet_experiment
+
+
+if is_rich_available():
+    from rich.columns import Columns
+    from rich.console import Console, Group
+    from rich.live import Live
+    from rich.panel import Panel
+    from rich.progress import Progress
+    from rich.table import Table
+
+if is_wandb_available():
+    import wandb
+
+if is_weave_available():
+    import weave
+    from weave import EvaluationLogger
+    from weave.trace.context import weave_client_context
+
+
+# Logger for module-level logging
+logger = logging.getLogger(__name__)
+
+
+def _generate_completions(
+    prompts: list[str],
+    model: PreTrainedModel,
+    tokenizer: PreTrainedTokenizerBase,
+    accelerator: Accelerator,
+    generation_config: GenerationConfig | None,
+    batch_size: int = 1,
+) -> list[str]:
+    """
+    Generates completions for a list of pre-formatted prompts from the given model.
+
+    Args:
+        prompts (list[str]): A list of input prompts for which completions are to be generated.
+        model (PreTrainedModel): The pre-trained model to be used for generation.
+        tokenizer (PreTrainedTokenizerBase): The tokenizer to be used for encoding and decoding.
+        accelerator (Accelerator): The accelerator to be used for model execution.
+        generation_config (GenerationConfig): Configuration for text generation.
+        batch_size (int, optional): The number of prompts to process in each batch. Default is 1.
+
+    Returns
+    -------
+        list[str]: A list of generated text completions corresponding to the input prompts.
+    """
+    completions = []
+    with unwrap_model_for_generation(model, accelerator) as unwrapped_model:
+        for idx in range(0, len(prompts), batch_size):
+            batch = prompts[idx : idx + batch_size]
+            tokenized_batch = tokenizer(
+                batch, return_tensors="pt", padding=True, truncation=True
+            ).to(model.device)
+            generations = unwrapped_model.generate(
+                **tokenized_batch,
+                generation_config=generation_config,
+            )
+            for prompt, generation in zip(
+                tokenized_batch.input_ids, generations, strict=True
+            ):
+                # Remove prompt from generation
+                generation = generation[len(prompt) :]
+                completion = tokenizer.decode(generation, skip_special_tokens=True)
+                completions.append(completion)
+    return completions
+
+
+class SyncRefModelCallback(TrainerCallback):
+    """
+    Callback to synchronize the model with a reference model.
+    """
+
+    def __init__(
+        self,
+        ref_model: PreTrainedModel | torch.nn.Module,
+        accelerator: Accelerator | None,
+    ):
+        self.accelerator = accelerator
+        self.ref_model = ref_model
+
+    @staticmethod
+    def _sync_target_model(model, target_model, alpha):
+        for target_param, copy_param in zip(
+            target_model.parameters(), model.parameters(), strict=True
+        ):
+            target_param.data.mul_(1.0 - alpha).add_(copy_param.data, alpha=alpha)
+
+    @staticmethod
+    def sync_target_model(model, target_model, alpha):
+        deepspeed_plugin = AcceleratorState().deepspeed_plugin
+        if deepspeed_plugin is not None and deepspeed_plugin.zero_stage == 3:
+            import deepspeed
+
+            with deepspeed.zero.GatheredParameters(
+                list(model.parameters()) + list(target_model.parameters()),
+                modifier_rank=0,
+            ):
+                if deepspeed.comm.get_rank() == 0:
+                    SyncRefModelCallback._sync_target_model(model, target_model, alpha)
+        else:
+            SyncRefModelCallback._sync_target_model(model, target_model, alpha)
+
+    def on_step_end(self, args, state, control, **kwargs):
+        model: PreTrainedModel = kwargs["model"]
+
+        if (
+            self.ref_model is not None
+            and state.global_step % args.ref_model_sync_steps == 0
+        ):
+            if self.accelerator:
+                model = self.accelerator.unwrap_model(model)
+            self.sync_target_model(model, self.ref_model, args.ref_model_mixup_alpha)
+
+
+class RichProgressCallback(TrainerCallback):
+    """
+    A [`TrainerCallback`] that displays the progress of training or evaluation using Rich.
+    """
+
+    def __init__(self):
+        if not is_rich_available():
+            raise ImportError(
+                "RichProgressCallback requires the `rich` extra. To install, run `pip install rich`."
+            )
+
+        self.training_bar = None
+        self.evaluation_bar = None
+        self.training_task = None
+        self.evaluation_task = None
+        self.rich_group = None
+        self.rich_console = None
+        self.training_status = None
+        self.current_step = None
+
+    def on_train_begin(self, args, state, control, **kwargs):
+        if not state.is_world_process_zero:
+            return
+
+        self.training_bar = Progress()
+        self.evaluation_bar = Progress()
+        self.rich_console = Console()
+        self.training_status = self.rich_console.status("Nothing to log yet ...")
+        self.rich_group = Live(
+            Panel(Group(self.training_bar, self.evaluation_bar, self.training_status))
+        )
+        self.rich_group.start()
+        self.training_task = self.training_bar.add_task(
+            "[blue]Training  ", total=state.max_steps
+        )
+        self.current_step = 0
+
+    def on_step_end(self, args, state, control, **kwargs):
+        if not state.is_world_process_zero:
+            return
+
+        self.training_bar.update(
+            self.training_task,
+            advance=state.global_step - self.current_step,
+            update=True,
+        )
+        self.current_step = state.global_step
+
+    def on_prediction_step(self, args, state, control, eval_dataloader=None, **kwargs):
+        if not state.is_world_process_zero:
+            return
+
+        if has_length(eval_dataloader):
+            if self.evaluation_task is None:
+                self.evaluation_task = self.evaluation_bar.add_task(
+                    "[blue]Evaluation", total=len(eval_dataloader)
+                )
+            self.evaluation_bar.update(self.evaluation_task, advance=1, update=True)
+
+    def on_evaluate(self, args, state, control, **kwargs):
+        if not state.is_world_process_zero:
+            return
+
+        if self.evaluation_task is not None:
+            self.evaluation_bar.remove_task(self.evaluation_task)
+            self.evaluation_task = None
+
+    def on_predict(self, args, state, control, **kwargs):
+        if not state.is_world_process_zero:
+            return
+
+        if self.evaluation_task is not None:
+            self.evaluation_bar.remove_task(self.evaluation_task)
+            self.evaluation_task = None
+
+    def on_log(self, args, state, control, logs=None, **kwargs):
+        if not (state.is_world_process_zero and self.training_bar):
+            return
+
+        # Group keys by top-level prefix
+        grouped_logs = {}
+        for key, value in logs.items():
+            parts = key.split("/")
+            group = parts[0] if len(parts) > 1 else None
+            subkey = "/".join(parts[1:]) if len(parts) > 1 else key
+            grouped_logs.setdefault(group, {})[subkey] = value
+
+        # Create a table per group
+        tables = []
+        for group_name, metrics in grouped_logs.items():
+            table = Table(
+                title=f"[bold blue]{group_name}[/]" if group_name else None,
+                header_style="bold magenta",
+                box=None,
+            )
+            table.add_column("Metric", justify="left", no_wrap=True)
+            table.add_column("Value", justify="right")
+
+            for metric, val in metrics.items():
+                formatted = f"{val:.3f}" if isinstance(val, (float, int)) else str(val)
+                table.add_row(metric, formatted)
+
+            tables.append(Panel(table, border_style="cyan", padding=(0, 1)))
+
+        # Arrange tables in columns using Columns
+        column_layout = Columns(tables, equal=False, expand=True)
+        self.training_status.update(
+            Panel(
+                column_layout,
+                title=f"[bold green]Step {state.global_step}[/bold green]",
+                border_style="green",
+            )
+        )
+
+    def on_train_end(self, args, state, control, **kwargs):
+        if not state.is_world_process_zero:
+            return
+
+        self.rich_group.stop()
+        self.training_bar = None
+        self.evaluation_bar = None
+        self.training_task = None
+        self.evaluation_task = None
+        self.rich_group = None
+        self.rich_console = None
+        self.training_status = None
+        self.current_step = None
+
+
+def _win_rate_completions_df(
+    state: TrainerState,
+    prompts: list[str],
+    completions: list[str],
+    winner_indices: list[str],
+) -> pd.DataFrame:
+    global_step = [str(state.global_step)] * len(prompts)
+    data = list(zip(global_step, prompts, completions, winner_indices, strict=True))
+    # Split completions from reference model and policy
+    split_data = [(item[0], item[1], item[2][0], item[2][1], item[3]) for item in data]
+    return pd.DataFrame(
+        split_data,
+        columns=["step", "prompt", "reference_model", "policy", "winner_index"],
+    )
+
+
+class WinRateCallback(TrainerCallback):
+    """
+    A [`~transformers.TrainerCallback`] that computes the win rate of a model based on a reference.
+
+    It generates completions using prompts from the evaluation dataset and compares the trained model's outputs against
+    a reference. The reference is either the initial version of the model (before training) or the reference model, if
+    available in the trainer. During each evaluation step, a judge determines how often the trained model's completions
+    win against the reference using a judge. The win rate is then logged in the trainer's logs under the key
+    `"eval_win_rate"`.
+
+    Usage:
+    ```python
+    trainer = DPOTrainer(...)
+    judge = PairRMJudge()
+    win_rate_callback = WinRateCallback(judge=judge, trainer=trainer)
+    trainer.add_callback(win_rate_callback)
+    ```
+
+    Args:
+        judge ([`experimental.judges.BasePairwiseJudge`]):
+            The judge to use for comparing completions.
+        trainer (`Trainer`):
+            Trainer to which the callback will be attached. The trainer's evaluation dataset must include a `"prompt"`
+            column containing the prompts for generating completions. If the `Trainer` has a reference model (via the
+            `ref_model` attribute), it will use this reference model for generating the reference completions;
+            otherwise, it defaults to using the initial model.
+        generation_config ([`~transformers.GenerationConfig`], *optional*):
+            The generation config to use for generating completions.
+        num_prompts (`int`, *optional*):
+            The number of prompts to generate completions for. If not provided, defaults to the number of examples in
+            the evaluation dataset.
+        shuffle_order (`bool`, *optional*, defaults to `True`):
+            Whether to shuffle the order of the completions before judging.
+        use_soft_judge (`bool`, *optional*, defaults to `False`):
+            Whether to use a soft judge that returns a win probability between 0 and 1 for the first completion vs the
+            second.
+    """
+
+    def __init__(
+        self,
+        judge,
+        trainer: Trainer,
+        generation_config: GenerationConfig | None = None,
+        num_prompts: int | None = None,
+        shuffle_order: bool = True,
+        use_soft_judge: bool = False,
+    ):
+        self.judge = judge
+        self.trainer = trainer
+        self.shuffle_order = shuffle_order
+        self.generation_config = generation_config
+        self.ref_completions = []
+        self.use_soft_judge = use_soft_judge
+
+        if self.trainer.eval_dataset is None:
+            raise ValueError(
+                "Trainer must have an evaluation dataset to use the WinRateCallback."
+            )
+        self.eval_dataset = self.trainer.eval_dataset
+
+        if num_prompts is not None:
+            self.eval_dataset = self.eval_dataset.select(range(num_prompts))
+
+    def on_train_begin(
+        self,
+        args: TrainingArguments,
+        state: TrainerState,
+        control: TrainerControl,
+        **kwargs,
+    ):
+        # When the trainer is initialized, we generate completions for the reference model.
+        tokenizer = kwargs["processing_class"]
+        tokenizer.padding_side = "left"
+        accelerator = self.trainer.accelerator
+        # Use the reference model if available, otherwise use the initial model
+        model = getattr(self.trainer, "ref_model", None)
+        # At this point, there are two cases where `ref_model` is None:
+        # 1. The method doesn't require a reference model.
+        # 2. The method uses a reference model, but `ref_model` is set to None.
+        #    This occurs when using PEFT, where the reference model can be obtained by simply disabling the model's adapter.
+        #    In theory, we should disable the adapter here, but since it's zero-initialized at the start of training,
+        #    the model behaves identically with or without the adapter.
+        #    Therefore, there's no need to explicitly disable it at this point.
+        if model is None:
+            model = self.trainer.model_wrapped
+        with accelerator.split_between_processes(
+            self.eval_dataset["prompt"]
+        ) as prompts:
+            self.ref_completions = _generate_completions(
+                prompts,
+                model=model,
+                tokenizer=tokenizer,
+                accelerator=accelerator,
+                generation_config=self.generation_config,
+                batch_size=args.per_device_eval_batch_size,
+            )
+            # Compute initial win rate as a reference point
+            completions = list(
+                zip(self.ref_completions, self.ref_completions, strict=True)
+            )
+            if self.use_soft_judge:
+                ref_win_probs = self.judge.judge(
+                    prompts, completions, self.shuffle_order, return_scores=True
+                )
+                winner_indices = [0 if score > 0.5 else 1 for score in ref_win_probs]
+                ref_win_probs = gather_object(ref_win_probs)
+            else:
+                winner_indices = self.judge.judge(
+                    prompts, completions, self.shuffle_order
+                )
+            prompts = gather_object(prompts)
+            completions = gather_object(completions)
+            winner_indices = gather_object(winner_indices)
+
+        # Logging
+        if self.trainer.accelerator.is_main_process:
+            win_rate = sum(winner_idx == 1 for winner_idx in winner_indices) / len(
+                winner_indices
+            )
+            if self.use_soft_judge:
+                avg_win_prob = 1.0 - sum(ref_win_probs) / len(ref_win_probs)
+                self.trainer.log(
+                    {"eval_avg_win_prob": avg_win_prob, "eval_win_rate": win_rate}
+                )
+            else:
+                self.trainer.log({"eval_win_rate": win_rate})
+
+            if "wandb" in args.report_to:
+                if wandb.run is not None:
+                    df = _win_rate_completions_df(
+                        state=state,
+                        prompts=prompts,
+                        completions=completions,
+                        winner_indices=winner_indices,
+                    )
+                    wandb.log({"win_rate_completions": wandb.Table(dataframe=df)})
+
+            if "comet_ml" in args.report_to:
+                df = _win_rate_completions_df(
+                    state=state,
+                    prompts=prompts,
+                    completions=completions,
+                    winner_indices=winner_indices,
+                )
+                log_table_to_comet_experiment(
+                    name="win_rate_completions.csv",
+                    table=df,
+                )
+
+    def on_evaluate(
+        self,
+        args: TrainingArguments,
+        state: TrainerState,
+        control: TrainerControl,
+        **kwargs,
+    ):
+        # At every evaluation step, we generate completions for the model and compare them with the reference
+        # completions that have been generated at the beginning of training. We then compute the win rate and log it to
+        # the trainer.
+        tokenizer = kwargs["processing_class"]
+        tokenizer.padding_side = "left"
+        accelerator = self.trainer.accelerator
+        model = self.trainer.model_wrapped
+        with accelerator.split_between_processes(
+            self.eval_dataset["prompt"]
+        ) as prompts:
+            completions = _generate_completions(
+                prompts,
+                model=model,
+                tokenizer=tokenizer,
+                accelerator=accelerator,
+                generation_config=self.generation_config,
+                batch_size=args.per_device_eval_batch_size,
+            )
+
+            completions = list(zip(self.ref_completions, completions, strict=True))
+
+            if self.use_soft_judge:
+                ref_win_probs = self.judge.judge(
+                    prompts, completions, self.shuffle_order, return_scores=True
+                )
+                winner_indices = [0 if score > 0.5 else 1 for score in ref_win_probs]
+                ref_win_probs = gather_object(ref_win_probs)
+            else:
+                winner_indices = self.judge.judge(
+                    prompts, completions, self.shuffle_order
+                )
+            prompts = gather_object(prompts)
+            completions = gather_object(completions)
+            winner_indices = gather_object(winner_indices)
+
+        # Logging
+        if self.trainer.accelerator.is_main_process:
+            win_rate = sum(winner_idx == 1 for winner_idx in winner_indices) / len(
+                winner_indices
+            )
+            if self.use_soft_judge:
+                avg_win_prob = 1.0 - sum(ref_win_probs) / len(ref_win_probs)
+                self.trainer.log(
+                    {"eval_avg_win_prob": avg_win_prob, "eval_win_rate": win_rate}
+                )
+            else:
+                self.trainer.log({"eval_win_rate": win_rate})
+
+            if "wandb" in args.report_to:
+                if wandb.run is not None:
+                    df = _win_rate_completions_df(
+                        state=state,
+                        prompts=prompts,
+                        completions=completions,
+                        winner_indices=winner_indices,
+                    )
+                    wandb.log({"win_rate_completions": wandb.Table(dataframe=df)})
+
+            if "comet_ml" in args.report_to:
+                df = _win_rate_completions_df(
+                    state=state,
+                    prompts=prompts,
+                    completions=completions,
+                    winner_indices=winner_indices,
+                )
+                log_table_to_comet_experiment(
+                    name="win_rate_completions.csv",
+                    table=df,
+                )
+
+
+class LogCompletionsCallback(TrainerCallback):
+    r"""
+    A [`~transformers.TrainerCallback`] that logs completions to Weights & Biases and/or Comet.
+
+    Usage:
+    ```python
+    trainer = DPOTrainer(...)
+    completions_callback = LogCompletionsCallback(trainer=trainer)
+    trainer.add_callback(completions_callback)
+    ```
+
+    Args:
+        trainer (`Trainer`):
+            Trainer to which the callback will be attached. The trainer's evaluation dataset must include a `"prompt"`
+            column containing the prompts for generating completions.
+        generation_config ([`~transformers.GenerationConfig`], *optional*):
+            The generation config to use for generating completions.
+        num_prompts (`int`, *optional*):
+            The number of prompts to generate completions for. If not provided, defaults to the number of examples in
+            the evaluation dataset.
+        freq (`int`, *optional*):
+            The frequency at which to log completions. If not provided, defaults to the trainer's `eval_steps`.
+    """
+
+    def __init__(
+        self,
+        trainer: Trainer,
+        generation_config: GenerationConfig | None = None,
+        num_prompts: int | None = None,
+        freq: int | None = None,
+    ):
+        self.trainer = trainer
+        self.generation_config = generation_config
+        self.freq = freq
+        self.table = []
+        self._last_logged_step = -1
+
+        if self.trainer.eval_dataset is None:
+            raise ValueError(
+                "Trainer must have an evaluation dataset to use the LogCompletionsCallback."
+            )
+        self.eval_dataset = self.trainer.eval_dataset
+
+        if num_prompts is not None:
+            self.eval_dataset = self.eval_dataset.select(range(num_prompts))
+
+    def on_step_end(self, args, state, control, **kwargs):
+        # Only log once per step (this method may be called multiple times)
+        if state.global_step == self._last_logged_step:
+            return
+
+        # Only log every `freq` steps (if no `freq` is provided, log every `eval_steps` steps)
+        freq = self.freq or state.eval_steps
+        if state.global_step % freq != 0:
+            return
+
+        tokenizer = kwargs["processing_class"]
+        tokenizer.padding_side = "left"
+        accelerator = self.trainer.accelerator
+        model = self.trainer.model_wrapped
+        with accelerator.split_between_processes(
+            self.eval_dataset["prompt"]
+        ) as prompts:
+            prompts = [
+                maybe_apply_chat_template({"prompt": prompt}, tokenizer)["prompt"]
+                for prompt in prompts
+            ]
+            completions = _generate_completions(
+                prompts,
+                model=model,
+                tokenizer=tokenizer,
+                accelerator=accelerator,
+                generation_config=self.generation_config,
+                batch_size=args.per_device_eval_batch_size,
+            )
+            completions = gather_object(completions)
+            prompts = gather_object(prompts)
+
+        # Build the data to log
+        if self.trainer.accelerator.is_main_process:
+            global_step = [str(state.global_step)] * len(prompts)
+            data = list(zip(global_step, prompts, completions, strict=True))
+            self.table.extend(data)
+            table = pd.DataFrame(
+                columns=["step", "prompt", "completion"], data=self.table
+            )
+
+            if "wandb" in args.report_to:
+                wandb.log({"completions": table})
+
+            if "comet_ml" in args.report_to:
+                log_table_to_comet_experiment(
+                    name="completions.csv",
+                    table=table,
+                )
+
+        # Save the last logged step, so we don't log the same completions multiple times
+        self._last_logged_step = state.global_step
+
+
+class WeaveCallback(TrainerCallback):
+    r"""
+    A [`~transformers.TrainerCallback`] that logs traces and evaluations to W&B Weave. The callback uses
+    https://weave-docs.wandb.ai/guides/evaluation/evaluation_logger/ to log traces and evaluations at each evaluation
+    step.
+
+    Supports two modes based on the `scorers` parameter:
+    - **Tracing Mode** (when scorers=None): Logs predictions for data exploration and analysis
+    - **Evaluation Mode** (when scorers provided): Logs predictions with scoring and summary metrics
+
+    Both modes use Weave's EvaluationLogger for structured, consistent data logging.
+
+    The callback logs data during evaluation phases (`on_evaluate`) rather than training steps, making it more
+    efficient and semantically correct. It gracefully handles missing weave installation by logging warnings and
+    skipping weave-specific functionality. It also checks for existing weave clients before initializing new ones.
+
+    Usage:
+    ```python
+    # Tracing mode (just log predictions)
+    trainer = DPOTrainer(...)
+    weave_callback = WeaveTraceCallback(trainer=trainer)  # project_name optional
+    trainer.add_callback(weave_callback)
+
+    # Or specify a project name
+    weave_callback = WeaveTraceCallback(trainer=trainer, project_name="my-llm-training")
+    trainer.add_callback(weave_callback)
+
+
+    # Evaluation mode (log predictions + scores + summary)
+    def accuracy_scorer(prompt: str, completion: str) -> float:
+        # Your scoring logic here (metadata available via eval_attributes)
+        return score
+
+
+    weave_callback = WeaveTraceCallback(
+        trainer=trainer,
+        project_name="my-llm-training",  # optional and needed only if weave client is not initialized
+        scorers={"accuracy": accuracy_scorer},
+    )
+    trainer.add_callback(weave_callback)
+    ```
+
+    Args:
+        trainer (`Trainer`):
+            Trainer to which the callback will be attached. The trainer's evaluation dataset must include a `"prompt"`
+            column containing the prompts for generating completions.
+        project_name (`str`, *optional*):
+            Name of the Weave project where data will be logged. If not provided, will try to use existing weave client
+            or fall back to the active wandb run's project name. Raises an error if none of these are available.
+        scorers (`dict[str, Callable]`, *optional*):
+            Dictionary mapping scorer names to scorer functions. If `None`, operates in tracing mode (predictions
+            only). If provided, operates in evaluation mode (predictions + scores + summary). Scorer functions should
+            have signature: `scorer(prompt: str, completion: str) -> float | int`
+        generation_config ([`~transformers.GenerationConfig`], *optional*):
+            Generation config to use for generating completions.
+        num_prompts (`int` or `None`, *optional*):
+            Number of prompts to generate completions for. If not provided, defaults to the number of examples in the
+            evaluation dataset.
+        dataset_name (`str`, *optional*, defaults to `"eval_dataset"`):
+            Name for the dataset metadata in Weave.
+        model_name (`str`, *optional*):
+            Name for the model metadata in Weave. If not provided, attempts to extract from model config.
+    """
+
+    def __init__(
+        self,
+        trainer: Trainer,
+        project_name: str | None = None,
+        scorers: dict[str, callable] | None = None,
+        generation_config: GenerationConfig | None = None,
+        num_prompts: int | None = None,
+        dataset_name: str = "eval_dataset",
+        model_name: str | None = None,
+    ):
+        self.trainer = trainer
+        self.project_name = project_name
+        self.scorers = scorers or {}
+        self.generation_config = generation_config
+        self.dataset_name = dataset_name
+        self.model_name = model_name
+        self._last_logged_step = -1
+        self._weave_initialized = False
+        self._eval_logger = None
+
+        if self.trainer.eval_dataset is None:
+            raise ValueError(
+                "Trainer must have an evaluation dataset to use the WeaveCallback."
+            )
+        self.eval_dataset = self.trainer.eval_dataset
+
+        if num_prompts is not None:
+            self.eval_dataset = self.eval_dataset.select(range(num_prompts))
+
+    def _initialize_weave(self):
+        """Initialize Weave and EvaluationLogger if not already initialized."""
+        if not self._weave_initialized:
+            if not is_weave_available():
+                logger.warning(
+                    "Weave is not available. Please install weave to enable logging: `pip install weave`"
+                )
+                return
+
+            if wc := weave_client_context.get_weave_client():
+                self._weave_client = wc
+            else:
+                if self.project_name is None:
+                    if is_wandb_available():
+                        if wandb.run is not None:
+                            self.project_name = (
+                                wandb.run.entity + "/" + wandb.run.project
+                            )
+                            logger.info(
+                                f"Using project name from active wandb run: {self.project_name}"
+                            )
+
+                    if self.project_name is None:
+                        raise ValueError(
+                            "No existing Weave client found and no project_name provided. "
+                            "Please either initialize weave with `weave.init('project-name')`, "
+                            "provide a project_name to the `WeaveTraceCallback`, "
+                            "or ensure an active wandb run exists."
+                        )
+
+                self._weave_client = weave.init(self.project_name)
+                logger.info(f"Initialized Weave with project: {self.project_name}")
+
+            if self.model_name is None:
+                self.model_name = getattr(
+                    self.trainer.model_wrapped.config, "_name_or_path", "unknown_model"
+                )
+
+            self._EvaluationLogger = EvaluationLogger
+
+            self._weave_initialized = True
+
+    @property
+    def is_evaluation_mode(self) -> bool:
+        """True if scorers are provided (evaluation mode), False for tracing mode."""
+        return bool(self.scorers)
+
+    def on_train_begin(self, args, state, control, **kwargs):
+        """Initialize Weave when training begins."""
+        self._initialize_weave()
+
+    def on_evaluate(self, args, state, control, **kwargs):
+        if state.global_step == self._last_logged_step:
+            return
+
+        self._initialize_weave()
+
+        if not self._weave_initialized:
+            logger.debug("Weave not initialized, skipping logging")
+            return
+
+        tokenizer = kwargs["processing_class"]
+        tokenizer.padding_side = "left"
+        accelerator = self.trainer.accelerator
+        model = self.trainer.model_wrapped
+
+        with accelerator.split_between_processes(
+            self.eval_dataset["prompt"]
+        ) as prompts:
+            prompts = [
+                maybe_apply_chat_template({"prompt": prompt}, tokenizer)["prompt"]
+                for prompt in prompts
+            ]
+
+            completions = _generate_completions(
+                prompts=prompts,
+                model=model,
+                tokenizer=tokenizer,
+                accelerator=accelerator,
+                generation_config=self.generation_config,
+                batch_size=args.per_device_eval_batch_size,
+            )
+
+            all_prompts = gather_object(prompts)
+            all_completions = gather_object(completions)
+
+        if self.trainer.accelerator.is_main_process:
+            eval_attributes = {
+                "training_step": state.global_step,
+                "model_name": self.model_name,
+                "generation_config": (
+                    self.generation_config.to_dict() if self.generation_config else None
+                ),
+            }
+
+            eval_logger = self._EvaluationLogger(
+                model=self.model_name,
+                dataset=self.dataset_name,
+                eval_attributes=eval_attributes,
+            )
+
+            successful_predictions = 0
+            total_score_values = {}  # For summary statistics
+
+            for prompt, completion in zip(all_prompts, all_completions, strict=True):
+                try:
+                    pred_logger = eval_logger.log_prediction(
+                        inputs={"prompt": prompt}, output=completion
+                    )
+
+                    if self.is_evaluation_mode:
+                        for scorer_name, scorer_func in self.scorers.items():
+                            try:
+                                score = scorer_func(prompt, completion)
+                                pred_logger.log_score(scorer=scorer_name, score=score)
+
+                                if scorer_name not in total_score_values:
+                                    total_score_values[scorer_name] = []
+                                total_score_values[scorer_name].append(score)
+
+                            except Exception as scorer_e:
+                                logger.warning(
+                                    f"Failed to apply scorer '{scorer_name}': {scorer_e}"
+                                )
+
+                    pred_logger.finish()
+                    successful_predictions += 1
+
+                except Exception as pred_e:
+                    logger.warning(f"Failed to log prediction for prompt: {pred_e}")
+                    # Continue with other predictions even if one fails
+
+            if self.is_evaluation_mode and total_score_values:
+                try:
+                    summary_stats = {
+                        "total_predictions": len(all_prompts),
+                        "successful_predictions": successful_predictions,
+                    }
+
+                    for scorer_name, scores in total_score_values.items():
+                        if scores:  # Only if we have valid scores
+                            summary_stats[f"avg_{scorer_name}"] = sum(scores) / len(
+                                scores
+                            )
+
+                    eval_logger.log_summary(summary_stats)
+
+                except Exception as summary_e:
+                    logger.warning(f"Failed to log summary: {summary_e}")
+            else:
+                try:
+                    eval_logger.finish()
+                except Exception as finish_e:
+                    logger.warning(f"Failed to finish evaluation logger: {finish_e}")
+
+        self._last_logged_step = state.global_step
+
+
+class MergeModelCallback(TrainerCallback):
+    r"""
+    A [`~transformers.TrainerCallback`] that merges the policy model (the model being trained) with another model based
+    on a merge configuration.
+
+    Args:
+        merge_config ([`MergeConfig`], *optional*):
+            Configuration used for the merging process. If not provided, the default [`MergeConfig`] is used.
+        merge_at_every_checkpoint (`bool`, *optional*, defaults to `False`):
+            Whether to merge the model at every checkpoint.
+        push_to_hub (`bool`, *optional*, defaults to `False`):
+            Whether to push the merged model to the Hub after merging.
+
+    Example:
+
+    ```python
+    from trl.mergekit_utils import MergeConfig
+    from trl import MergeModelCallback
+
+    config = MergeConfig()
+    merge_callback = MergeModelCallback(config)
+    trainer = DPOTrainer(..., callbacks=[merge_callback])
+    ```
+    """
+
+    def __init__(
+        self,
+        merge_config: "MergeConfig | None" = None,
+        merge_at_every_checkpoint: bool = False,
+        push_to_hub: bool = False,
+    ):
+        if not is_mergekit_available():
+            raise ImportError(
+                "MergeModelCallback requires the `mergekit` extra. To install, run `pip install mergekit`."
+            )
+        self.merge_config = merge_config or MergeConfig()
+        self.merge_at_every_checkpoint = merge_at_every_checkpoint
+        self.push_to_hub = push_to_hub
+
+    def _merge_and_maybe_push(self, output_dir, global_step, model):
+        checkpoint_path = os.path.join(output_dir, f"checkpoint-{global_step}")
+        self.merge_config.policy_model_path = checkpoint_path
+        if self.merge_config.target_model_path is None:
+            self.merge_config.target_model_path = get_config_model_id(model.config)
+        merge_path = os.path.join(checkpoint_path, "merged")
+
+        merge_models(self.merge_config.create(), merge_path)
+
+        if self.push_to_hub:
+            repo_name = f"{output_dir}_checkpoint-{global_step}_merged"
+            upload_model_to_hf(merge_path, repo_name)
+
+    def on_save(self, args, state, control, model=None, **kwargs):
+        if self.merge_at_every_checkpoint:
+            self._merge_and_maybe_push(args.output_dir, state.global_step, model)
+
+    def on_train_end(self, args, state, control, model=None, **kwargs):
+        if not self.merge_at_every_checkpoint:
+            self._merge_and_maybe_push(args.output_dir, state.global_step, model)
+
+
+class BEMACallback(TrainerCallback):
+    # docstyle-ignore
+    r"""
+    A [`~transformers.TrainerCallback`] that implements [BEMA](https://huggingface.co/papers/2508.00180)
+    (Bias-Corrected Exponential Moving Average) by [Adam Block](https://huggingface.co/abblock) and [Cyril
+    Zhang](https://huggingface.co/cyrilzhang). Code from https://github.com/abblock/bema under MIT license.
+
+    BEMA computes model weights that scale like:
+
+    $$
+    \theta_t' = \alpha_t \cdot (\theta_t - \theta_0) + \text{EMA}_t
+    $$
+
+    where  \\( \theta_t \\) is the current model weights,  \\( \theta_0 \\) is a snapshot of the model weights at the
+    first `update_after` step,  \\( \text{EMA}_t  \\) is the exponential moving average of the model weights, and
+     \\( \alpha_t \\) is a scaling factor that decays with the number of steps  \\( t \\) as
+
+    $$
+    \alpha_t = (\rho + \gamma \cdot t)^{-\eta}.
+    $$
+
+    The EMA is computed as:
+
+    $$
+    \text{EMA}_t = (1 - \beta_t) \cdot \text{EMA}_{t-1} + \beta_t \cdot \theta_t
+    $$
+
+    where  \\( \beta_t \\) is a decay factor that decays with the number of steps  \\( t \\) as
+
+    $$
+    \beta_t = (\rho + \gamma \cdot t)^{-\kappa}.
+    $$
+
+    Args:
+        update_freq (`int`, *optional*, defaults to `400`):
+            Update the BEMA weights every X steps. Denoted this as  \\( \phi \\) in the paper.
+        ema_power (`float`, *optional*, defaults to `0.5`):
+            Power for the EMA decay factor. Denoted  \\( \kappa \\) in the paper. To disable EMA, set this to `0.0`.
+        bias_power (`float`, *optional*, defaults to `0.2`):
+            Power for the BEMA scaling factor. Denoted  \\( \eta \\) in the paper. To disable BEMA, set this to `0.0`.
+        lag (`int`, *optional*, defaults to `10`):
+            Initial offset in the weight decay schedule that controls early-stage smoothness by acting as a virtual
+            starting age for the updates. Denoted as  \\( \rho \\) in the paper.
+        update_after (`int`, *optional*, defaults to `0`):
+            Burn-in time before starting to update the BEMA weights. Denoted  \\( \tau \\) in the paper.
+        multiplier (`float`, *optional*, defaults to `1.0`):
+            Initial value for the EMA decay factor. Denoted as  \\( \gamma \\) in the paper.
+        min_ema_multiplier (`float`, *optional*, defaults to `0.0`):
+            Minimum value for the EMA decay factor.
+        device (`str`, *optional*, defaults to `"cpu"`):
+            Device to use for the BEMA buffers, e.g. `"cpu"` or `"cuda"`. Note that in most cases, this device SHOULD
+            BE DIFFERENT from the device used for training in order to avoid OOM.
+
+    Example:
+
+    ```python
+    from trl import BEMACallback
+
+    trainer = Trainer(..., callbacks=[BEMACallback()])
+    ```
+    """
+
+    def __init__(
+        self,
+        update_freq: int = 400,
+        ema_power: float = 0.5,
+        bias_power: float = 0.2,
+        lag: int = 10,
+        update_after: int = 0,
+        multiplier: float = 1.0,
+        min_ema_multiplier: float = 0.0,
+        device: str = "cpu",
+    ):
+        # User-provided hyperparams
+        self.update_freq = update_freq
+        self.ema_power = ema_power
+        self.bias_power = bias_power
+        self.lag = lag
+        self.update_after = update_after
+        self.multiplier = multiplier
+        self.min_ema_multiplier = min_ema_multiplier
+        self.device = device
+
+        # Internal state
+        self.param_names = []  # references to training model param names
+        self.thetat_params = []  # references to training model params
+        self.theta0_params = []  # θ₀ buffers (on self.device)
+        self.ema_params = []  # EMA buffers (on self.device)
+        self.running_model = None  # a copy of the model to run BEMA on
+
+    @staticmethod
+    def _unwrap_model(model):
+        """
+        Helper function to unwrap model from various wrappers including DataParallel, DistributedDataParallel,
+        DeepSpeed, and FSDP.
+        """
+        # Handle DeepSpeed
+        if hasattr(model, "module") and hasattr(model, "engine"):
+            # DeepSpeed engine
+            return model.module
+
+        # Handle FSDP
+        if hasattr(model, "_fsdp_wrapped_module"):
+            # FSDP wrapped model
+            return model._fsdp_wrapped_module
+
+        # Handle DataParallel/DistributedDataParallel
+        if hasattr(model, "module"):
+            return model.module
+
+        return model
+
+    @torch.no_grad()
+    def on_train_begin(
+        self,
+        args: TrainingArguments,
+        state: TrainerState,
+        control: TrainerControl,
+        model: PreTrainedModel,
+        **kwargs,
+    ):
+        model = self._unwrap_model(model)
+
+        # Create a new instance and load state_dict
+        self.running_model = type(model)(model.config).to(self.device)
+        self.running_model.load_state_dict(model.state_dict())
+
+        # Cache trainable parameters once in a fixed order
+        for name, param in model.named_parameters():
+            if not param.requires_grad:
+                continue
+            self.param_names.append(name)
+            self.thetat_params.append(param)
+
+            # Clone θ₀ and EMA on the same device as model
+            theta0 = param.detach().clone().to(self.device)
+            self.theta0_params.append(theta0)
+            self.ema_params.append(theta0.clone())  # initialize EMA with θ₀
+
+    def _ema_beta(self, step: int) -> float:
+        """Compute the EMA decay factor βₜ = (ρ + γ·t)⁻ᵏᵃᵖᵖᵃ."""
+        beta = (self.lag + self.multiplier * step) ** (-self.ema_power)
+        return max(beta, self.min_ema_multiplier)
+
+    def _bema_alpha(self, step: int) -> float:
+        """Compute the BEMA scaling factor αₜ = (ρ + γ·t)⁻ᵉᵗᵃ."""
+        return (self.lag + self.multiplier * step) ** (-self.bias_power)
+
+    def _update_bema_weights(self, step: int):
+        beta = self._ema_beta(step)
+        alpha = self._bema_alpha(step)
+
+        # Compute EMA + BEMA in-place and write directly to running_model
+        for thetat, theta0, ema, run_param in zip(
+            self.thetat_params,
+            self.theta0_params,
+            self.ema_params,
+            self.running_model.parameters(),
+            strict=True,
+        ):
+            thetat = thetat.detach().to(self.device)
+            ema.mul_(1 - beta).add_(
+                thetat, alpha=beta
+            )  # EMA update: ema = (1 - beta) * ema + beta * θₜ
+            run_param.copy_(
+                ema + alpha * (thetat - theta0)
+            )  # BEMA update: run_param = ema + alpha * (θₜ - θ₀)
+
+    @torch.no_grad()
+    def on_step_end(
+        self,
+        args: TrainingArguments,
+        state: TrainerState,
+        control: TrainerControl,
+        model: PreTrainedModel,
+        **kwargs,
+    ):
+        step = state.global_step
+
+        # If we haven't reached the update_after step, skip the BEMA update
+        if step < self.update_after:
+            return
+
+        # Snapshot θ₀ and EMA at first update
+        if step == self.update_after:
+            for thetat_param, theta0_param, ema_param in zip(
+                self.thetat_params, self.theta0_params, self.ema_params, strict=True
+            ):
+                theta0_param.copy_(thetat_param)
+                ema_param.copy_(thetat_param)
+
+        # Update BEMA weights every `update_freq` steps
+        elif (step - self.update_after) % self.update_freq == 0:
+            self._update_bema_weights(step)
+            logger.info(f"Updated BEMA weights at step {step}")
+
+    @torch.no_grad()
+    def on_train_end(
+        self,
+        args: TrainingArguments,
+        state: TrainerState,
+        control: TrainerControl,
+        **kwargs,
+    ):
+        if state.is_world_process_zero:
+            save_directory = f"{args.output_dir}/bema"
+            self.running_model.save_pretrained(save_directory)
+            logger.info(f"Saved BEMA model to {save_directory}")
diff --git a/src/aixpert/training/training/trl/trainer/cpo_config.py b/src/aixpert/training/training/trl/trainer/cpo_config.py
new file mode 100644
index 0000000..f554e02
--- /dev/null
+++ b/src/aixpert/training/training/trl/trainer/cpo_config.py
@@ -0,0 +1,227 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass, field
+from typing import Any
+
+from transformers import TrainingArguments
+
+
+@dataclass
+class CPOConfig(TrainingArguments):
+    r"""
+    Configuration class for the [`CPOTrainer`].
+
+    This class includes only the parameters that are specific to CPO training. For a full list of training arguments,
+    please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this class may
+    differ from those in [`~transformers.TrainingArguments`].
+
+    Using [`~transformers.HfArgumentParser`] we can turn this class into
+    [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
+    command line.
+
+    Parameters
+    ----------
+        max_length (`int` or `None`, *optional*, defaults to `1024`):
+            Maximum length of the sequences (prompt + completion) in the batch. This argument is required if you want
+            to use the default data collator.
+        max_prompt_length (`int` or `None`, *optional*, defaults to `512`):
+            Maximum length of the prompt. This argument is required if you want to use the default data collator.
+        max_completion_length (`int`, *optional*):
+            Maximum length of the completion. This argument is required if you want to use the default data collator
+            and your model is an encoder-decoder.
+        beta (`float`, *optional*, defaults to `0.1`):
+            Parameter controlling the deviation from the reference model. Higher β means less deviation from the
+            reference model. For the IPO loss (`loss_type="ipo"`), β is the regularization parameter denoted by τ in
+            the [paper](https://huggingface.co/papers/2310.12036).
+        label_smoothing (`float`, *optional*, defaults to `0.0`):
+            Label smoothing factor. This argument is required if you want to use the default data collator.
+        loss_type (`str`, *optional*, defaults to `"sigmoid"`):
+            Type of loss to use. Possible values are:
+
+                - `"sigmoid"`: sigmoid loss from the original [DPO](https://huggingface.co/papers/2305.18290) paper.
+                - `"hinge"`: hinge loss on the normalized likelihood from the
+                  [SLiC](https://huggingface.co/papers/2305.10425) paper.
+                - `"ipo"`: IPO loss from the [IPO](https://huggingface.co/papers/2310.12036) paper.
+                - `"simpo"`: SimPO loss from the [SimPO](https://huggingface.co/papers/2405.14734) paper.
+                - `"alphapo"`: AlphaPO loss from the [AlphaPO](https://huggingface.co/papers/2501.03884) paper. This
+                  automatically sets `loss_type="simpo"` and `cpo_alpha=0.0`.
+
+        disable_dropout (`bool`, *optional*, defaults to `True`):
+            Whether to disable dropout in the model.
+        cpo_alpha (`float`, *optional*, defaults to `1.0`):
+            Weight of the BC regularizer in CPO training.
+        simpo_gamma (`float`, *optional*, defaults to `0.5`):
+            Target reward margin for the SimPO loss, used only when the `loss_type="simpo"`.
+        alpha (`float`, *optional*, defaults to `0.0`):
+            Alpha parameter that controls reward function shape across all loss types. When alpha=0 (default), uses
+            standard log probability rewards. When `alpha != 0`, applies AlphaPO transformation: `r = (1 - p^(-alpha))
+            / alpha` from the [AlphaPO paper](https://huggingface.co/papers/2501.03884). This parameter works with all
+            loss types.
+        label_pad_token_id (`int`, *optional*, defaults to `-100`):
+            Label pad token id. This argument is required if you want to use the default data collator.
+        padding_value (`int`, *optional*):
+            Padding value to use. If `None`, the padding value of the tokenizer is used.
+        truncation_mode (`str`,*optional*,  defaults to `"keep_end"`):
+            Truncation mode to use when the prompt is too long. Possible values are `"keep_end"` or `"keep_start"`.
+            This argument is required if you want to use the default data collator.
+        generate_during_eval (`bool`, *optional*, defaults to `False`):
+            If `True`, generates and logs completions from the model to W&B or Comet during evaluation.
+        is_encoder_decoder (`bool`, *optional*):
+            When using the `model_init` argument (callable) to instantiate the model instead of the `model` argument,
+            you need to specify if the model returned by the callable is an encoder-decoder model.
+        model_init_kwargs (`dict[str, Any]`, *optional*):
+            Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a
+            string.
+        dataset_num_proc (`int`, *optional*):
+            Number of processes to use for processing the dataset.
+    """
+
+    _VALID_DICT_FIELDS = TrainingArguments._VALID_DICT_FIELDS + ["model_init_kwargs"]
+
+    # Parameters whose default values are overridden from TrainingArguments
+    learning_rate: float = field(
+        default=1e-6,
+        metadata={"help": "The initial learning rate for AdamW."},
+    )
+    logging_steps: float = field(
+        default=10,
+        metadata={
+            "help": "Log every X updates steps. Should be an integer or a float in range `[0,1)`. If smaller than 1, "
+            "will be interpreted as ratio of total training steps."
+        },
+    )
+    gradient_checkpointing: bool = field(
+        default=True,
+        metadata={
+            "help": "If True, use gradient checkpointing to save memory at the expense of slower backward pass."
+        },
+    )
+    bf16: bool | None = field(
+        default=None,
+        metadata={
+            "help": "Whether to use bf16 (mixed) precision instead of 32-bit. Requires Ampere or higher NVIDIA "
+            "architecture or Intel XPU or using CPU (use_cpu) or Ascend NPU. If not set, it defaults to `True` if "
+            "`fp16` is not set."
+        },
+    )
+
+    max_length: int | None = field(
+        default=1024,
+        metadata={
+            "help": "Maximum length of the sequences (prompt + completion) in the batch."
+        },
+    )
+    max_prompt_length: int | None = field(
+        default=512,
+        metadata={
+            "help": "Maximum length of the prompt. This argument is required if you want to use the default data "
+            "collator and your model is an encoder-decoder."
+        },
+    )
+    max_completion_length: int | None = field(
+        default=None,
+        metadata={
+            "help": "Maximum length of the completion. This argument is required if you want to use the default data "
+            "collator and your model is an encoder-decoder."
+        },
+    )
+    beta: float = field(
+        default=0.1,
+        metadata={
+            "help": "Parameter controlling the deviation from the reference model. Higher β means less deviation from "
+            "the reference model."
+        },
+    )
+    label_smoothing: float = field(
+        default=0.0,
+        metadata={"help": "Label smoothing factor."},
+    )
+    loss_type: str = field(
+        default="sigmoid",
+        metadata={
+            "help": "Type of loss to use.",
+            "choices": ["sigmoid", "hinge", "ipo", "simpo", "alphapo"],
+        },
+    )
+    disable_dropout: bool = field(
+        default=True,
+        metadata={"help": "Whether to disable dropout in the model."},
+    )
+    cpo_alpha: float = field(
+        default=1.0,
+        metadata={"help": "Weight of the BC regularizer in CPO training."},
+    )
+    simpo_gamma: float = field(
+        default=0.5,
+        metadata={
+            "help": "Target reward margin for the SimPO loss, used only when the `loss_type='simpo'`."
+        },
+    )
+    alpha: float = field(
+        default=0.0,
+        metadata={
+            "help": "Alpha parameter that controls reward function shape across all loss types. When alpha=0 "
+            "(default), uses standard log probability rewards. When `alpha != 0`, applies AlphaPO transformation: "
+            "`r = (1 - p^(-alpha)) / alpha` from the AlphaPO paper. This parameter works with all loss types."
+        },
+    )
+    label_pad_token_id: int = field(
+        default=-100,
+        metadata={"help": "Label pad token id."},
+    )
+    padding_value: int | None = field(
+        default=None,
+        metadata={
+            "help": "Padding value to use. If `None`, the padding value of the tokenizer is used."
+        },
+    )
+    truncation_mode: str = field(
+        default="keep_end",
+        metadata={
+            "help": "Truncation mode to use when the prompt is too long.",
+            "choices": ["keep_end", "keep_start"],
+        },
+    )
+    generate_during_eval: bool = field(
+        default=False,
+        metadata={
+            "help": "If `True`, generates and logs completions from the model to W&B during evaluation."
+        },
+    )
+    is_encoder_decoder: bool | None = field(
+        default=None,
+        metadata={"help": "Whether the model is an encoder-decoder model."},
+    )
+    model_init_kwargs: dict[str, Any] | None = field(
+        default=None,
+        metadata={
+            "help": "Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model "
+            "from a string."
+        },
+    )
+    dataset_num_proc: int | None = field(
+        default=None,
+        metadata={"help": "Number of processes to use for processing the dataset."},
+    )
+
+    def __post_init__(self):
+        self.bf16 = not (self.fp16) if self.bf16 is None else self.bf16
+
+        # Syntactic sugar for AlphaPO: set loss_type to "simpo" and cpo_alpha to 0.0
+        if self.loss_type == "alphapo":
+            self.loss_type = "simpo"
+            self.cpo_alpha = 0.0
+
+        super().__post_init__()
diff --git a/src/aixpert/training/training/trl/trainer/cpo_trainer.py b/src/aixpert/training/training/trl/trainer/cpo_trainer.py
new file mode 100644
index 0000000..1a9c480
--- /dev/null
+++ b/src/aixpert/training/training/trl/trainer/cpo_trainer.py
@@ -0,0 +1,1278 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import os
+import random
+import textwrap
+import warnings
+from collections import defaultdict
+from collections.abc import Callable
+from contextlib import nullcontext
+from pathlib import Path
+from typing import Any, Literal
+
+import numpy as np
+import pandas as pd
+import torch
+import torch.nn.functional as F
+from accelerate import PartialState, logging
+from datasets import Dataset
+from torch import autocast, nn
+from torch.utils.data import DataLoader
+from transformers import (
+    AutoModelForCausalLM,
+    BaseImageProcessor,
+    DataCollator,
+    FeatureExtractionMixin,
+    PreTrainedModel,
+    PreTrainedTokenizerBase,
+    ProcessorMixin,
+    is_comet_available,
+    is_wandb_available,
+)
+from transformers.trainer_callback import TrainerCallback
+from transformers.trainer_utils import EvalLoopOutput
+from transformers.utils import is_peft_available, is_torch_fx_proxy
+
+from ..data_utils import maybe_apply_chat_template, maybe_extract_prompt
+from .base_trainer import BaseTrainer
+from .cpo_config import CPOConfig
+from .utils import (
+    DPODataCollatorWithPadding,
+    add_bos_token_if_needed,
+    add_eos_token_if_needed,
+    disable_dropout_in_model,
+    log_table_to_comet_experiment,
+    pad_to_length,
+    peft_module_casting_to_bf16,
+    selective_log_softmax,
+)
+
+
+if is_peft_available():
+    from peft import PeftModel, get_peft_model, prepare_model_for_kbit_training
+
+
+if is_wandb_available():
+    import wandb
+
+
+logger = logging.get_logger(__name__)
+
+
+class CPOTrainer(BaseTrainer):
+    r"""
+    Initialize CPOTrainer.
+
+    Args:
+        model ([`~transformers.PreTrainedModel`]):
+            The model to train, preferably an [`~transformers.AutoModelForSequenceClassification`].
+        args ([`CPOConfig`]):
+            The CPO config arguments to use for training.
+        data_collator ([`~transformers.DataCollator`]):
+            The data collator to use for training. If None is specified, the default data collator
+            ([`DPODataCollatorWithPadding`]) will be used which will pad the sequences to the maximum length of the
+            sequences in the batch, given a dataset of paired sequences.
+        train_dataset ([`~datasets.Dataset`]):
+            The dataset to use for training.
+        eval_dataset ([`~datasets.Dataset`]):
+            The dataset to use for evaluation.
+        processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*):
+            Processing class used to process the data. If provided, will be used to automatically process the inputs
+            for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
+            reuse the fine-tuned model.
+        model_init (`Callable[[], transformers.PreTrainedModel]`):
+            The model initializer to use for training. If None is specified, the default model initializer will be
+            used.
+        callbacks (`list[transformers.TrainerCallback]`):
+            The callbacks to use for training.
+        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
+            The optimizer and scheduler to use for training.
+        preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
+            The function to use to preprocess the logits before computing the metrics.
+        peft_config (`dict`, defaults to `None`):
+            The PEFT configuration to use for training. If you pass a PEFT configuration, the model will be wrapped in
+            a PEFT model.
+        compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
+            The function to use to compute the metrics. Must take a `EvalPrediction` and return a dictionary string to
+            metric values.
+    """
+
+    _tag_names = ["trl", "cpo"]
+    _name = "CPO"
+    _paper = {
+        "title": "Contrastive Preference Optimization: Pushing the Boundaries of LLM Performance in Machine Translation",
+        "id": "2401.08417",
+        # docstyle-ignore
+        "citation": textwrap.dedent("""\
+            @inproceedings{xu2024contrastive,
+                title        = {{Contrastive Preference Optimization: Pushing the Boundaries of LLM Performance in Machine Translation}},
+                author       = {Haoran Xu and Amr Sharaf and Yunmo Chen and Weiting Tan and Lingfeng Shen and Benjamin Van Durme and Kenton Murray and Young Jin Kim},
+                year         = 2024,
+                booktitle    = {Forty-first International Conference on Machine Learning, {ICML} 2024, Vienna, Austria, July 21-27, 2024},
+                publisher    = {OpenReview.net},
+                url          = {https://openreview.net/forum?id=51iwkioZpn}
+            }"""),
+    }
+
+    def __init__(
+        self,
+        model: PreTrainedModel | nn.Module | str | None = None,
+        args: CPOConfig | None = None,
+        data_collator: DataCollator | None = None,
+        train_dataset: Dataset | None = None,
+        eval_dataset: Dataset | dict[str, Dataset] | None = None,
+        processing_class: PreTrainedTokenizerBase
+        | BaseImageProcessor
+        | FeatureExtractionMixin
+        | ProcessorMixin
+        | None = None,
+        model_init: Callable[[], PreTrainedModel] | None = None,
+        callbacks: list[TrainerCallback] | None = None,
+        optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (
+            None,
+            None,
+        ),
+        preprocess_logits_for_metrics: Callable[
+            [torch.Tensor, torch.Tensor], torch.Tensor
+        ]
+        | None = None,
+        peft_config: dict | None = None,
+        compute_metrics: Callable[[EvalLoopOutput], dict] | None = None,
+    ):
+        if not os.environ.get("TRL_EXPERIMENTAL_SILENCE"):
+            warnings.warn(
+                "This trainer will soon be moved to trl.experimental and is a candidate for removal. If you rely on "
+                "it and want it to remain, please share your comments here: "
+                "https://github.com/huggingface/trl/issues/4223. Silence this warning by setting environment variable "
+                "TRL_EXPERIMENTAL_SILENCE=1."
+            )
+        if args.model_init_kwargs is None:
+            model_init_kwargs = {}
+        elif not isinstance(model, str):
+            raise ValueError(
+                "You passed model_kwargs to the CPOTrainer. But your model is already instantiated."
+            )
+        else:
+            model_init_kwargs = args.model_init_kwargs
+            dtype = model_init_kwargs.get("dtype")
+            if dtype is not None:
+                # Convert to `torch.dtype` if an str is passed
+                if isinstance(dtype, str) and dtype != "auto":
+                    dtype = getattr(torch, dtype)
+                if dtype != "auto" and not isinstance(dtype, torch.dtype):
+                    raise ValueError(
+                        f"Invalid `dtype` passed to the CPOConfig. Expected a string with either `torch.dtype` or 'auto', but got {dtype}."
+                    )
+                model_init_kwargs["dtype"] = dtype
+
+        if isinstance(model, str):
+            model = AutoModelForCausalLM.from_pretrained(model, **model_init_kwargs)
+
+        # Initialize this variable to False. This helps tracking the case when `peft_module_casting_to_bf16`
+        # has been called in order to properly call autocast if needed.
+        self._peft_has_been_casted_to_bf16 = False
+
+        if not is_peft_available() and peft_config is not None:
+            raise ValueError(
+                "PEFT is not installed and you passed a `peft_config` in the trainer's kwargs, please install it to use the PEFT models"
+            )
+        if is_peft_available() and peft_config is not None:
+            # if model is a peft model and we have a peft_config, we merge and unload it first
+            if isinstance(model, PeftModel):
+                model = model.merge_and_unload()
+
+            if getattr(model, "is_loaded_in_8bit", False) or getattr(
+                model, "is_loaded_in_4bit", False
+            ):
+                _support_gc_kwargs = hasattr(
+                    args, "gradient_checkpointing_kwargs"
+                ) and "gradient_checkpointing_kwargs" in list(
+                    inspect.signature(prepare_model_for_kbit_training).parameters
+                )
+
+                prepare_model_kwargs = {
+                    "use_gradient_checkpointing": args.gradient_checkpointing
+                }
+
+                if _support_gc_kwargs:
+                    prepare_model_kwargs["gradient_checkpointing_kwargs"] = (
+                        args.gradient_checkpointing_kwargs
+                    )
+
+                model = prepare_model_for_kbit_training(model, **prepare_model_kwargs)
+            elif args.gradient_checkpointing:
+                # For backward compatibility with older versions of transformers
+                if hasattr(model, "enable_input_require_grads"):
+                    model.enable_input_require_grads()
+                else:
+
+                    def make_inputs_require_grad(module, input, output):
+                        output.requires_grad_(True)
+
+                    model.get_input_embeddings().register_forward_hook(
+                        make_inputs_require_grad
+                    )
+
+            # get peft model with the given config
+            model = get_peft_model(model, peft_config)
+            if args.bf16 and getattr(model, "is_loaded_in_4bit", False):
+                peft_module_casting_to_bf16(model)
+                # If args.bf16 we need to explicitly call `generate` with torch amp autocast context manager
+                self._peft_has_been_casted_to_bf16 = True
+
+        # For models that use gradient_checkpointing, we need to attach a hook that enables input
+        # to explicitly have `requires_grad=True`, otherwise training will either silently
+        # fail or completely fail.
+        elif args.gradient_checkpointing:
+            # For backward compatibility with older versions of transformers
+            if hasattr(model, "enable_input_require_grads"):
+                model.enable_input_require_grads()
+            else:
+
+                def make_inputs_require_grad(module, input, output):
+                    output.requires_grad_(True)
+
+                model.get_input_embeddings().register_forward_hook(
+                    make_inputs_require_grad
+                )
+
+        if args.generate_during_eval and not (
+            is_wandb_available() or is_comet_available()
+        ):
+            raise ValueError(
+                "`generate_during_eval=True` requires Weights and Biases or Comet to be installed."
+                " Please install `wandb` or `comet-ml` to resolve."
+            )
+
+        if model is not None:
+            self.is_encoder_decoder = model.config.is_encoder_decoder
+        elif args.is_encoder_decoder is None:
+            raise ValueError(
+                "When no model is provided, you need to pass the parameter is_encoder_decoder."
+            )
+        else:
+            self.is_encoder_decoder = args.is_encoder_decoder
+
+        if self.is_encoder_decoder:
+            self.decoder_start_token_id = model.config.decoder_start_token_id
+            self.pad_token_id = model.config.pad_token_id
+
+        if processing_class is None:
+            raise ValueError(
+                "processing_class must be specified to tokenize a CPO dataset."
+            )
+        if args.max_length is None:
+            logger.warning(
+                "`max_length` is not set in the CPOConfig's init"
+                " it will default to `512` by default, but you should do it yourself in the future.",
+            )
+            max_length = 512
+        else:
+            max_length = args.max_length
+        if args.max_prompt_length is None:
+            logger.warning(
+                "`max_prompt_length` is not set in the CPOConfig's init"
+                " it will default to `128` by default, but you should do it yourself in the future.",
+            )
+            max_prompt_length = 128
+        else:
+            max_prompt_length = args.max_prompt_length
+
+        if not max_prompt_length < max_length:
+            raise ValueError(
+                f"max_prompt_length ({max_prompt_length}) should be strictly less than max_length ({max_length})."
+            )
+
+        if args.max_completion_length is None and self.is_encoder_decoder:
+            logger.warning(
+                "When using an encoder decoder architecture, you should set `max_completion_length` in the CPOConfig's init"
+                " it will default to `128` by default, but you should do it yourself in the future.",
+            )
+            max_completion_length = 128
+        else:
+            max_completion_length = args.max_completion_length
+
+        if data_collator is None:
+            data_collator = DPODataCollatorWithPadding(
+                pad_token_id=processing_class.pad_token_id,
+                label_pad_token_id=args.label_pad_token_id,
+                is_encoder_decoder=self.is_encoder_decoder,
+            )
+
+            if args.remove_unused_columns:
+                args.remove_unused_columns = False
+                # warn users
+                logger.warning(
+                    "When using DPODataCollatorWithPadding, you should set `remove_unused_columns=False` in your TrainingArguments"
+                    " we have set it for you, but you should do it yourself in the future.",
+                )
+
+            self.use_dpo_data_collator = True
+        else:
+            self.use_dpo_data_collator = False
+
+        # Disable dropout in the model
+        if args.disable_dropout:
+            disable_dropout_in_model(model)
+
+        self.max_length = max_length
+        self.generate_during_eval = args.generate_during_eval
+        self.label_pad_token_id = args.label_pad_token_id
+        self.padding_value = (
+            args.padding_value
+            if args.padding_value is not None
+            else processing_class.pad_token_id
+        )
+        self.max_prompt_length = max_prompt_length
+        self.truncation_mode = args.truncation_mode
+        self.max_completion_length = max_completion_length
+        self.processing_class = processing_class
+
+        if args.loss_type in ["hinge", "ipo"] and args.label_smoothing > 0:
+            logger.warning(
+                f"You are using the {args.loss_type} loss type that does not support label smoothing. The "
+                "`label_smoothing` parameter will be ignored. Set `label_smoothing` to `0.0` to remove this warning.",
+            )
+        if args.loss_type == "kto_pair":
+            raise ValueError(
+                "Support for kto_pair has been removed in CPOTrainer. Please use KTOTrainer."
+            )
+
+        self.beta = args.beta
+        self.label_smoothing = args.label_smoothing
+        self.loss_type = args.loss_type
+        self.cpo_alpha = args.cpo_alpha
+        self.aux_loss_enabled = getattr(model.config, "output_router_logits", False)
+        self.aux_loss_coef = getattr(model.config, "router_aux_loss_coef", 0.0)
+        if self.aux_loss_enabled and self.aux_loss_coef == 0.0:
+            logger.warning(
+                "You set `output_router_logits` to `True` in the model config, but `router_aux_loss_coef` is set to "
+                "`0.0`, meaning the auxiliary loss will not be used. Either set `router_aux_loss_coef` to a value "
+                "greater than `0.0`, or set `output_router_logits` to `False` if you don't want to use the auxiliary "
+                "loss.",
+            )
+
+        if args.loss_type == "simpo":
+            self.simpo_gamma = args.simpo_gamma
+
+        # AlphaPO parameter for reward shaping
+        self.alpha = args.alpha
+
+        self._stored_metrics = defaultdict(lambda: defaultdict(list))
+
+        # The trainer estimates the number of FLOPs (floating-point operations) using the number of elements in the
+        # input tensor associated with the key "input_ids". However, in CPO, the sampled data does not include the
+        # "input_ids" key. Instead, the available keys are "prompt_input_ids", "chosen_input_ids", and
+        # "rejected_input_ids". As a result, the trainer issues the warning: "Could not estimate the number of tokens
+        # of the input, floating-point operations will not be computed." To suppress this warning, we set the
+        # "estimate_tokens" key in the model's "warnings_issued" dictionary to True. This acts as a flag to indicate
+        # that the warning has already been issued.
+        model.warnings_issued["estimate_tokens"] = True
+
+        # Compute that only on the main process for faster data processing.
+        # see: https://github.com/huggingface/trl/pull/1255
+        with PartialState().main_process_first():
+            # Extract the prompt if needed, and apply the chat template if needed
+            train_dataset = train_dataset.map(
+                maybe_extract_prompt, num_proc=args.dataset_num_proc
+            )
+            train_dataset = train_dataset.map(
+                maybe_apply_chat_template,
+                fn_kwargs={"tokenizer": processing_class},
+                num_proc=args.dataset_num_proc,
+            )
+            if eval_dataset is not None:
+                eval_dataset = eval_dataset.map(
+                    maybe_extract_prompt, num_proc=args.dataset_num_proc
+                )
+                eval_dataset = eval_dataset.map(
+                    maybe_apply_chat_template,
+                    fn_kwargs={"tokenizer": processing_class},
+                    num_proc=args.dataset_num_proc,
+                )
+
+            # tokenize the dataset
+            train_dataset = train_dataset.map(
+                self.tokenize_row, num_proc=args.dataset_num_proc
+            )
+            if eval_dataset is not None:
+                eval_dataset = eval_dataset.map(
+                    self.tokenize_row, num_proc=args.dataset_num_proc
+                )
+
+        super().__init__(
+            model=model,
+            args=args,
+            data_collator=data_collator,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            processing_class=processing_class,
+            model_init=model_init,
+            compute_metrics=compute_metrics,
+            callbacks=callbacks,
+            optimizers=optimizers,
+            preprocess_logits_for_metrics=preprocess_logits_for_metrics,
+        )
+
+        # Gradient accumulation requires scaled loss. Normally, loss scaling in the parent class depends on whether the
+        # model accepts loss-related kwargs. Since we compute our own loss, this check is irrelevant. We set
+        # self.model_accepts_loss_kwargs to False to enable scaling.
+        self.model_accepts_loss_kwargs = False
+
+        # Add tags for models that have been loaded with the correct transformers version
+        if hasattr(self.model, "add_model_tags"):
+            self.model.add_model_tags(self._tag_names)
+
+        if not hasattr(self, "accelerator"):
+            raise AttributeError(
+                "Your `Trainer` does not have an `accelerator` object. Consider upgrading `transformers`."
+            )
+
+    def build_tokenized_answer(self, prompt, answer):
+        """
+        Llama tokenizer does satisfy `enc(a + b) = enc(a) + enc(b)`. It does ensure `enc(a + b) = enc(a) + enc(a +
+        b)[len(enc(a)):]`. Reference:
+            https://github.com/EleutherAI/lm-evaluation-harness/pull/531#issuecomment-1595586257
+        """
+        full_tokenized = self.processing_class(
+            prompt + answer, add_special_tokens=False
+        )
+        prompt_input_ids = self.processing_class(prompt, add_special_tokens=False)[
+            "input_ids"
+        ]
+
+        answer_input_ids = full_tokenized["input_ids"][len(prompt_input_ids) :]
+        answer_attention_mask = full_tokenized["attention_mask"][
+            len(prompt_input_ids) :
+        ]
+
+        # Concat tokens to form `enc(a) + enc(a + b)[len(enc(a)):]`
+        full_concat_input_ids = np.concatenate([prompt_input_ids, answer_input_ids])
+
+        # Prepare input tokens for token by token comparison
+        full_input_ids = np.array(full_tokenized["input_ids"])
+
+        if len(full_input_ids) != len(full_concat_input_ids):
+            raise ValueError(
+                "Prompt input ids and answer input ids should have the same length."
+            )
+
+        # On some tokenizers, like Llama-2 tokenizer, there are occasions where tokens
+        # can be merged together when tokenizing prompt+answer. This could result
+        # on the last token from the prompt being different when tokenized on its own
+        # vs when done as prompt+answer.
+        response_token_ids_start_idx = len(prompt_input_ids)
+
+        # If tokenized prompt is different than both prompt+answer, then it means the
+        # last token has changed due to merging.
+        if (
+            prompt_input_ids
+            != full_tokenized["input_ids"][:response_token_ids_start_idx]
+        ):
+            response_token_ids_start_idx -= 1
+
+        prompt_input_ids = full_tokenized["input_ids"][:response_token_ids_start_idx]
+        prompt_attention_mask = full_tokenized["attention_mask"][
+            :response_token_ids_start_idx
+        ]
+
+        if len(prompt_input_ids) != len(prompt_attention_mask):
+            raise ValueError(
+                "Prompt input ids and attention mask should have the same length."
+            )
+
+        answer_input_ids = full_tokenized["input_ids"][response_token_ids_start_idx:]
+        answer_attention_mask = full_tokenized["attention_mask"][
+            response_token_ids_start_idx:
+        ]
+
+        return dict(
+            prompt_input_ids=prompt_input_ids,
+            prompt_attention_mask=prompt_attention_mask,
+            input_ids=answer_input_ids,
+            attention_mask=answer_attention_mask,
+        )
+
+    def tokenize_row(
+        self, feature, model: PreTrainedModel | nn.Module | None = None
+    ) -> dict:
+        """Tokenize a single row from a CPO specific dataset.
+
+        At this stage, we don't convert to PyTorch tensors yet; we just handle the truncation in case the prompt +
+        chosen or prompt + rejected responses is/are too long. First we truncate the prompt; if we're still too long,
+        we truncate the chosen/rejected.
+
+        We also create the labels for the chosen/rejected responses, which are of length equal to the sum of the length
+        of the prompt and the chosen/rejected response, with label_pad_token_id for the prompt tokens.
+        """
+        batch = {}
+        prompt = feature["prompt"]
+        chosen = feature["chosen"]
+        rejected = feature["rejected"]
+
+        if not self.is_encoder_decoder:
+            # Check issues below for more details
+            #  1. https://github.com/huggingface/trl/issues/907
+            #  2. https://github.com/EleutherAI/lm-evaluation-harness/pull/531#issuecomment-1595586257
+            #  3. https://github.com/LianjiaTech/BELLE/issues/337
+
+            if not isinstance(prompt, str):
+                raise ValueError(f"prompt should be an str but got {type(prompt)}")
+            prompt_tokens = self.processing_class(prompt, add_special_tokens=False)
+            prompt_tokens = {f"prompt_{k}": v for k, v in prompt_tokens.items()}
+
+            if not isinstance(chosen, str):
+                raise ValueError(f"chosen should be an str but got {type(chosen)}")
+            chosen_tokens = self.build_tokenized_answer(prompt, chosen)
+
+            if not isinstance(rejected, str):
+                raise ValueError(f"rejected should be an str but got {type(rejected)}")
+            rejected_tokens = self.build_tokenized_answer(prompt, rejected)
+
+            # Last prompt token might get merged by tokenizer and
+            # it should not be included for generation if that happens
+            prompt_len_input_ids = len(prompt_tokens["prompt_input_ids"])
+
+            chosen_prompt_len_input_ids = len(chosen_tokens["prompt_input_ids"])
+            rejected_prompt_len_input_ids = len(rejected_tokens["prompt_input_ids"])
+            prompt_len_input_ids = min(
+                chosen_prompt_len_input_ids, rejected_prompt_len_input_ids
+            )
+
+            for k, v in prompt_tokens.items():
+                prompt_tokens[k] = v[:prompt_len_input_ids]
+
+            # Make sure prompts only have one different token at most an
+            # and length only differs by 1 at most
+            num_diff_tokens = sum(
+                a != b
+                for a, b in zip(
+                    chosen_tokens["prompt_input_ids"],
+                    rejected_tokens["prompt_input_ids"],
+                    strict=True,
+                )
+            )
+            num_diff_len = abs(
+                chosen_prompt_len_input_ids - rejected_prompt_len_input_ids
+            )
+            if num_diff_tokens > 1 or num_diff_len > 1:
+                raise ValueError(
+                    "Chosen and rejected prompt_input_ids might only differ on the "
+                    "last token due to tokenizer merge ops."
+                )
+
+            # add BOS token to head of prompt. Avoid adding if it's already there
+            prompt_tokens, chosen_tokens, rejected_tokens = add_bos_token_if_needed(
+                self.processing_class.bos_token_id,
+                prompt_len_input_ids,
+                prompt_tokens,
+                chosen_prompt_len_input_ids,
+                chosen_tokens,
+                rejected_prompt_len_input_ids,
+                rejected_tokens,
+            )
+
+            # add EOS token to end of answer. Avoid adding if it's already there
+            chosen_tokens, rejected_tokens = add_eos_token_if_needed(
+                self.processing_class.eos_token_id, chosen_tokens, rejected_tokens
+            )
+
+            longer_response_length = max(
+                len(chosen_tokens["input_ids"]), len(rejected_tokens["input_ids"])
+            )
+
+            # if combined sequence is too long, truncate the prompt
+            for answer_tokens in [chosen_tokens, rejected_tokens, prompt_tokens]:
+                if (
+                    len(answer_tokens["prompt_input_ids"]) + longer_response_length
+                    > self.max_length
+                ):
+                    if self.truncation_mode == "keep_start":
+                        for k in ["prompt_input_ids", "prompt_attention_mask"]:
+                            answer_tokens[k] = answer_tokens[k][
+                                : self.max_prompt_length
+                            ]
+                    elif self.truncation_mode == "keep_end":
+                        for k in ["prompt_input_ids", "prompt_attention_mask"]:
+                            answer_tokens[k] = answer_tokens[k][
+                                -self.max_prompt_length :
+                            ]
+                    else:
+                        raise ValueError(
+                            f"Unknown truncation mode: {self.truncation_mode}"
+                        )
+
+            # if that's still too long, truncate the response
+            for answer_tokens in [chosen_tokens, rejected_tokens]:
+                if (
+                    len(answer_tokens["prompt_input_ids"]) + longer_response_length
+                    > self.max_length
+                ):
+                    for k in ["input_ids", "attention_mask"]:
+                        answer_tokens[k] = answer_tokens[k][
+                            : self.max_length - self.max_prompt_length
+                        ]
+
+            # Create labels
+            chosen_sequence_tokens = {
+                k: chosen_tokens[f"prompt_{k}"] + chosen_tokens[k]
+                for k in ["input_ids", "attention_mask"]
+            }
+            rejected_sequence_tokens = {
+                k: rejected_tokens[f"prompt_{k}"] + rejected_tokens[k]
+                for k in ["input_ids", "attention_mask"]
+            }
+            chosen_sequence_tokens["labels"] = chosen_sequence_tokens["input_ids"][:]
+            chosen_sequence_tokens["labels"][
+                : len(chosen_tokens["prompt_input_ids"])
+            ] = [self.label_pad_token_id] * len(chosen_tokens["prompt_input_ids"])
+            rejected_sequence_tokens["labels"] = rejected_sequence_tokens["input_ids"][
+                :
+            ]
+            rejected_sequence_tokens["labels"][
+                : len(rejected_tokens["prompt_input_ids"])
+            ] = [self.label_pad_token_id] * len(rejected_tokens["prompt_input_ids"])
+
+            for k, toks in {
+                "chosen_": chosen_sequence_tokens,
+                "rejected_": rejected_sequence_tokens,
+                "": prompt_tokens,
+            }.items():
+                for type_key, tokens in toks.items():
+                    if type_key == "token_type_ids":
+                        continue
+                    batch[f"{k}{type_key}"] = tokens
+
+        else:
+            chosen_tokens = self.processing_class(
+                chosen,
+                truncation=True,
+                max_length=self.max_completion_length,
+                add_special_tokens=True,
+            )
+            rejected_tokens = self.processing_class(
+                rejected,
+                truncation=True,
+                max_length=self.max_completion_length,
+                add_special_tokens=True,
+            )
+            prompt_tokens = self.processing_class(
+                prompt,
+                truncation=True,
+                max_length=self.max_prompt_length,
+                add_special_tokens=True,
+            )
+
+            batch["chosen_labels"] = chosen_tokens["input_ids"]
+            batch["rejected_labels"] = rejected_tokens["input_ids"]
+            batch["prompt_input_ids"] = prompt_tokens["input_ids"]
+            batch["prompt_attention_mask"] = prompt_tokens["attention_mask"]
+
+            if model is not None and hasattr(
+                model, "prepare_decoder_input_ids_from_labels"
+            ):
+                batch["rejected_decoder_input_ids"] = (
+                    model.prepare_decoder_input_ids_from_labels(
+                        labels=torch.tensor(batch["rejected_labels"])
+                    )
+                )
+                batch["chosen_decoder_input_ids"] = (
+                    model.prepare_decoder_input_ids_from_labels(
+                        labels=torch.tensor(batch["chosen_labels"])
+                    )
+                )
+
+        return batch
+
+    @staticmethod
+    def concatenated_inputs(
+        batch: dict[str, list | torch.LongTensor],
+        is_encoder_decoder: bool = False,
+        label_pad_token_id: int = -100,
+        padding_value: int = 0,
+        device: torch.device | None = None,
+    ) -> dict[str, torch.LongTensor]:
+        """Concatenate the chosen and rejected inputs into a single tensor.
+
+        Args:
+            batch:
+                A batch of data. Must contain the keys 'chosen_input_ids' and 'rejected_input_ids', which are tensors
+                of shape (batch_size, sequence_length).
+            is_encoder_decoder:
+                Whether the model is an encoder-decoder model.
+            label_pad_token_id:
+                The label pad token id.
+            padding_value:
+                The padding value to use for the concatenated inputs_ids.
+            device:
+                The device for the concatenated inputs.
+
+        Returns
+        -------
+            A dictionary containing the concatenated inputs under the key 'concatenated_input_ids'.
+        """
+        concatenated_batch = {}
+
+        if is_encoder_decoder:
+            max_length = max(
+                batch["chosen_labels"].shape[1], batch["rejected_labels"].shape[1]
+            )
+        else:
+            max_length = max(
+                batch["chosen_input_ids"].shape[1], batch["rejected_input_ids"].shape[1]
+            )
+
+        for k in batch:
+            if k.startswith("chosen") and isinstance(batch[k], torch.Tensor):
+                if "labels" in k or is_encoder_decoder:
+                    pad_value = label_pad_token_id
+                elif k.endswith("_input_ids"):
+                    pad_value = padding_value
+                elif k.endswith("_attention_mask"):
+                    pad_value = 0
+                concatenated_key = k.replace("chosen", "concatenated")
+                concatenated_batch[concatenated_key] = pad_to_length(
+                    batch[k], max_length, pad_value=pad_value
+                )
+        for k in batch:
+            if k.startswith("rejected") and isinstance(batch[k], torch.Tensor):
+                if "labels" in k or is_encoder_decoder:
+                    pad_value = label_pad_token_id
+                elif k.endswith("_input_ids"):
+                    pad_value = padding_value
+                elif k.endswith("_attention_mask"):
+                    pad_value = 0
+                concatenated_key = k.replace("rejected", "concatenated")
+                concatenated_batch[concatenated_key] = torch.cat(
+                    (
+                        concatenated_batch[concatenated_key],
+                        pad_to_length(batch[k], max_length, pad_value=pad_value),
+                    ),
+                    dim=0,
+                ).to(device=device)
+
+        if is_encoder_decoder:
+            concatenated_batch["concatenated_input_ids"] = (
+                batch["prompt_input_ids"].repeat(2, 1).to(device=device)
+            )
+            concatenated_batch["concatenated_attention_mask"] = (
+                batch["prompt_attention_mask"].repeat(2, 1).to(device=device)
+            )
+
+        return concatenated_batch
+
+    def cpo_loss(
+        self,
+        policy_chosen_logps: torch.FloatTensor,
+        policy_rejected_logps: torch.FloatTensor,
+    ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
+        """Compute the CPO loss for a batch of policy and reference model log probabilities.
+
+        Args:
+            policy_chosen_logps:
+                Log probabilities of the policy model for the chosen responses. Shape: (batch_size,)
+            policy_rejected_logps:
+                Log probabilities of the policy model for the rejected responses. Shape: (batch_size,)
+
+        Returns
+        -------
+            A tuple of three tensors: (losses, chosen_rewards, rejected_rewards). The losses tensor contains the CPO
+            loss for each example in the batch. The chosen_rewards and rejected_rewards tensors contain the rewards for
+            the chosen and rejected responses, respectively.
+        """
+        # Apply AlphaPO reward transformation if alpha != 0
+        if self.alpha != 0.0:
+            # Compute probabilities
+            chosen_probs = torch.exp(policy_chosen_logps)
+            rejected_probs = torch.exp(policy_rejected_logps)
+
+            # Apply AlphaPO transformation: r = (1 - p^(-alpha)) / alpha
+            policy_chosen_rewards = (1 - chosen_probs.pow(-self.alpha)) / self.alpha
+            policy_rejected_rewards = (1 - rejected_probs.pow(-self.alpha)) / self.alpha
+
+            logits = (policy_chosen_rewards - policy_rejected_rewards).to(
+                self.accelerator.device
+            )
+        else:
+            # Standard log probability rewards when alpha = 0
+            logits = (policy_chosen_logps - policy_rejected_logps).to(
+                self.accelerator.device
+            )
+
+        # The beta is a temperature parameter for the CPO loss, typically something in the range of 0.1 to 0.5.
+        # We ignore the reference model as beta -> 0. The label_smoothing parameter encodes our uncertainty about the labels and
+        # calculates a conservative CPO loss.
+
+        if self.loss_type == "simpo":
+            gamma_logratios = self.simpo_gamma / self.beta
+            logits = logits - gamma_logratios
+            # This reduces to Equation 3 from the CPO paper when label_smoothing -> 0.
+            losses = (
+                -F.logsigmoid(self.beta * logits) * (1 - self.label_smoothing)
+                - F.logsigmoid(-self.beta * logits) * self.label_smoothing
+            )
+        elif self.loss_type == "sigmoid":
+            # This reduces to Equation 3 from the CPO paper when label_smoothing -> 0.
+            losses = (
+                -F.logsigmoid(self.beta * logits) * (1 - self.label_smoothing)
+                - F.logsigmoid(-self.beta * logits) * self.label_smoothing
+            )
+        elif self.loss_type == "hinge":
+            losses = torch.relu(1 - self.beta * logits)
+        elif self.loss_type == "ipo":
+            # eqn (17) of the paper where beta is the regularization parameter for the IPO loss, denoted by tau in the paper.
+            losses = (logits - 1 / (2 * self.beta)) ** 2
+        else:
+            raise ValueError(
+                f"Unknown loss type: {self.loss_type}. Should be one of ['sigmoid', 'hinge', 'ipo', 'simpo']"
+            )
+
+        # Calculate rewards for logging
+        if self.alpha != 0.0:
+            # When using AlphaPO transformation, use the transformed rewards
+            chosen_rewards = (
+                self.beta * policy_chosen_rewards.to(self.accelerator.device).detach()
+            )
+            rejected_rewards = (
+                self.beta * policy_rejected_rewards.to(self.accelerator.device).detach()
+            )
+        else:
+            # Standard log probability rewards
+            chosen_rewards = (
+                self.beta * (policy_chosen_logps.to(self.accelerator.device)).detach()
+            )
+            rejected_rewards = (
+                self.beta * (policy_rejected_logps.to(self.accelerator.device)).detach()
+            )
+
+        return losses, chosen_rewards, rejected_rewards
+
+    @staticmethod
+    def get_batch_logps(
+        logits: torch.FloatTensor,
+        labels: torch.LongTensor,
+        average_log_prob: bool = False,
+        label_pad_token_id: int = -100,
+        is_encoder_decoder: bool = False,
+    ) -> torch.FloatTensor:
+        """Compute the log probabilities of the given labels under the given logits.
+
+        Args:
+            logits: Logits of the model (unnormalized). Shape: (batch_size, sequence_length, vocab_size)
+            labels:
+                Labels for which to compute the log probabilities. Label tokens with a value of label_pad_token_id are
+                ignored. Shape: (batch_size, sequence_length)
+            average_log_prob:
+                If True, return the average log probability per (non-masked) token. Otherwise, return the sum of the
+                log probabilities of the (non-masked) tokens.
+            label_pad_token_id: The label pad token id.
+            is_encoder_decoder: Whether the model is an encoder-decoder model.
+
+        Returns
+        -------
+            A tensor of shape (batch_size,) containing the average/sum log probabilities of the given labels under the
+            given logits.
+        """
+        if logits.shape[:-1] != labels.shape:
+            raise ValueError(
+                "Logits (batch and sequence length dim) and labels must have the same shape."
+            )
+
+        if not is_encoder_decoder:
+            labels = labels[:, 1:].clone()
+            logits = logits[:, :-1, :]
+        loss_mask = labels != label_pad_token_id
+
+        # dummy token; we'll ignore the losses on these tokens later
+        labels[labels == label_pad_token_id] = 0
+
+        per_token_logps = selective_log_softmax(logits, labels)
+
+        if average_log_prob:
+            return (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1)
+        return (per_token_logps * loss_mask).sum(-1)
+
+    def concatenated_forward(
+        self, model: nn.Module, batch: dict[str, list | torch.LongTensor]
+    ) -> tuple[
+        torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor
+    ]:
+        """Run the given model on the given batch of inputs, concatenating the chosen and rejected inputs together.
+
+        We do this to avoid doing two forward passes, because it's faster for FSDP.
+        """
+        concatenated_batch = self.concatenated_inputs(
+            batch,
+            is_encoder_decoder=self.is_encoder_decoder,
+            label_pad_token_id=self.label_pad_token_id,
+            padding_value=self.padding_value,
+            device=self.accelerator.device,
+        )
+        len_chosen = batch["chosen_labels"].shape[0]
+
+        model_kwargs = (
+            {
+                "decoder_input_ids": self._shift_right(
+                    concatenated_batch["concatenated_labels"]
+                ),
+            }
+            if self.is_encoder_decoder
+            else {}
+        )
+
+        if self.aux_loss_enabled:
+            model_kwargs["output_router_logits"] = True
+
+        outputs = model(
+            concatenated_batch["concatenated_input_ids"],
+            attention_mask=concatenated_batch["concatenated_attention_mask"],
+            use_cache=False,
+            **model_kwargs,
+        )
+        all_logits = outputs.logits
+
+        def cross_entropy_loss(logits, labels):
+            if not self.is_encoder_decoder:
+                # Shift so that tokens < n predict n
+                logits = logits[..., :-1, :].contiguous()
+                labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = nn.CrossEntropyLoss()
+            logits = logits.view(-1, logits.shape[-1])
+            labels = labels.view(-1)
+            # Enable model parallelism
+            labels = labels.to(logits.device)
+            loss = loss_fct(logits, labels)
+            return loss
+
+        labels = concatenated_batch["concatenated_labels"].clone()
+
+        if self.cpo_alpha == 0:
+            nll_loss = torch.tensor(0.0).to(self.accelerator.device)
+        else:
+            nll_loss = cross_entropy_loss(all_logits[:len_chosen], labels[:len_chosen])
+
+        all_logps = self.get_batch_logps(
+            all_logits,
+            concatenated_batch["concatenated_labels"],
+            average_log_prob=self.loss_type in ["ipo", "simpo"],
+            is_encoder_decoder=self.is_encoder_decoder,
+            label_pad_token_id=self.label_pad_token_id,
+        )
+
+        chosen_logps = all_logps[:len_chosen]
+        rejected_logps = all_logps[len_chosen:]
+
+        chosen_logits = all_logits[:len_chosen]
+        rejected_logits = all_logits[len_chosen:]
+
+        if self.aux_loss_enabled:
+            return (
+                chosen_logps,
+                rejected_logps,
+                chosen_logits,
+                rejected_logits,
+                nll_loss,
+                outputs.aux_loss,
+            )
+
+        return (chosen_logps, rejected_logps, chosen_logits, rejected_logits, nll_loss)
+
+    def get_batch_loss_metrics(
+        self,
+        model,
+        batch: dict[str, list | torch.LongTensor],
+        train_eval: Literal["train", "eval"] = "train",
+    ):
+        """Compute the CPO loss and other metrics for the given batch of inputs for train or test."""
+        metrics = {}
+
+        forward_output = self.concatenated_forward(model, batch)
+        (
+            policy_chosen_logps,
+            policy_rejected_logps,
+            policy_chosen_logits,
+            policy_rejected_logits,
+            policy_nll_loss,
+        ) = forward_output[:5]
+        if self.aux_loss_enabled:
+            aux_loss = forward_output[5]
+
+        losses, chosen_rewards, rejected_rewards = self.cpo_loss(
+            policy_chosen_logps,
+            policy_rejected_logps,
+        )
+
+        loss = losses.mean() + self.cpo_alpha * policy_nll_loss
+        reward_accuracies = (chosen_rewards > rejected_rewards).float()
+
+        prefix = "eval_" if train_eval == "eval" else ""
+        metrics[f"{prefix}rewards/chosen"] = (
+            self.accelerator.gather_for_metrics(chosen_rewards).mean().item()
+        )
+        metrics[f"{prefix}rewards/rejected"] = (
+            self.accelerator.gather_for_metrics(rejected_rewards).mean().item()
+        )
+        metrics[f"{prefix}rewards/accuracies"] = (
+            self.accelerator.gather_for_metrics(reward_accuracies).mean().item()
+        )
+        metrics[f"{prefix}rewards/margins"] = (
+            self.accelerator.gather_for_metrics(chosen_rewards - rejected_rewards)
+            .mean()
+            .item()
+        )
+        metrics[f"{prefix}logps/rejected"] = (
+            self.accelerator.gather_for_metrics(policy_rejected_logps)
+            .detach()
+            .mean()
+            .item()
+        )
+        metrics[f"{prefix}logps/chosen"] = (
+            self.accelerator.gather_for_metrics(policy_chosen_logps)
+            .detach()
+            .mean()
+            .item()
+        )
+        metrics[f"{prefix}logits/rejected"] = (
+            self.accelerator.gather_for_metrics(policy_rejected_logits.detach().mean())
+            .mean()
+            .item()
+        )
+        metrics[f"{prefix}logits/chosen"] = (
+            self.accelerator.gather_for_metrics(policy_chosen_logits.detach().mean())
+            .mean()
+            .item()
+        )
+        metrics[f"{prefix}nll_loss"] = (
+            self.accelerator.gather_for_metrics(policy_nll_loss).detach().mean().item()
+        )
+
+        if self.aux_loss_enabled:
+            loss += self.aux_loss_coef * aux_loss
+
+        return loss, metrics
+
+    def compute_loss(
+        self,
+        model: PreTrainedModel | nn.Module,
+        inputs: dict[str, torch.Tensor | Any],
+        return_outputs=False,
+        num_items_in_batch=None,
+    ) -> torch.Tensor | tuple[torch.Tensor, dict[str, torch.Tensor]]:
+        compute_loss_context_manager = (
+            autocast(self.accelerator.device.type)
+            if self._peft_has_been_casted_to_bf16
+            else nullcontext()
+        )
+
+        with compute_loss_context_manager:
+            loss, metrics = self.get_batch_loss_metrics(
+                model, inputs, train_eval="train"
+            )
+
+        # force log the metrics
+        self.store_metrics(metrics, train_eval="train")
+
+        if return_outputs:
+            return (loss, metrics)
+        return loss
+
+    def generate_from_model(self, model, batch: dict[str, torch.LongTensor]) -> str:
+        """Generate samples from the model and reference model for the given batch of inputs."""
+        # If one uses `generate_during_eval` with peft + bf16, we need to explicitly call generate with
+        # the torch amp context manager as some hidden states are silently casted to full precision.
+        generate_context_manager = (
+            autocast(self.accelerator.device.type)
+            if self._peft_has_been_casted_to_bf16
+            else nullcontext()
+        )
+
+        with generate_context_manager:
+            policy_output = model.generate(
+                input_ids=batch["prompt_input_ids"],
+                attention_mask=batch["prompt_attention_mask"],
+                max_length=self.max_length,
+                do_sample=True,
+                pad_token_id=self.processing_class.pad_token_id,
+            )
+
+        policy_output = pad_to_length(
+            policy_output, self.max_length, self.processing_class.pad_token_id
+        )
+        policy_output_decoded = self.processing_class.batch_decode(
+            policy_output, skip_special_tokens=True
+        )
+
+        return policy_output_decoded
+
+    def prediction_step(
+        self,
+        model: PreTrainedModel | nn.Module,
+        inputs: dict[str, torch.Tensor | Any],
+        prediction_loss_only: bool,
+        ignore_keys: list[str] | None = None,
+    ):
+        if ignore_keys is None:
+            if hasattr(model, "config"):
+                ignore_keys = getattr(model.config, "keys_to_ignore_at_inference", [])
+            else:
+                ignore_keys = []
+
+        prediction_context_manager = (
+            autocast(self.accelerator.device.type)
+            if self._peft_has_been_casted_to_bf16
+            else nullcontext()
+        )
+
+        with torch.no_grad(), prediction_context_manager:
+            loss, metrics = self.get_batch_loss_metrics(
+                model, inputs, train_eval="eval"
+            )
+
+        # force log the metrics
+        self.store_metrics(metrics, train_eval="eval")
+
+        if prediction_loss_only:
+            return (loss.detach(), None, None)
+
+        # logits for the chosen and rejected samples from model
+        logits_dict = {
+            "eval_logits/chosen": metrics["eval_logits/chosen"],
+            "eval_logits/rejected": metrics["eval_logits/rejected"],
+        }
+        logits = [v for k, v in logits_dict.items() if k not in ignore_keys]
+        logits = torch.tensor(logits, device=self.accelerator.device)
+        labels = torch.zeros(logits.shape[0], device=self.accelerator.device)
+
+        return (loss.detach(), logits, labels)
+
+    def store_metrics(
+        self, metrics: dict[str, float], train_eval: Literal["train", "eval"] = "train"
+    ) -> None:
+        for key, value in metrics.items():
+            self._stored_metrics[train_eval][key].append(value)
+
+    def evaluation_loop(
+        self,
+        dataloader: DataLoader,
+        description: str,
+        prediction_loss_only: bool | None = None,
+        ignore_keys: list[str] | None = None,
+        metric_key_prefix: str = "eval",
+    ) -> EvalLoopOutput:
+        """
+        Overriding built-in evaluation loop to store metrics for each batch. Prediction/evaluation loop, shared by
+        `Trainer.evaluate()` and `Trainer.predict()`.
+
+        Works both with or without labels.
+        """
+        # Sample and save to game log if requested (for one batch to save time)
+        if self.generate_during_eval:
+            # Generate random indices within the range of the total number of samples
+            num_samples = len(dataloader.dataset)
+            random_indices = random.sample(
+                range(num_samples), k=self.args.eval_batch_size
+            )
+
+            # Use dataloader.dataset.select to get the random batch without iterating over the DataLoader
+            random_batch_dataset = dataloader.dataset.select(random_indices)
+            random_batch = self.data_collator(random_batch_dataset)
+            random_batch = self._prepare_inputs(random_batch)
+
+            policy_output_decoded = self.generate_from_model(self.model, random_batch)
+
+            table = pd.DataFrame(
+                columns=["Prompt", "Policy"],
+                data=[
+                    [prompt, pol[len(prompt) :]]
+                    for prompt, pol in zip(
+                        random_batch["prompt"], policy_output_decoded, strict=True
+                    )
+                ],
+            )
+            if "wandb" in self.args.report_to:
+                wandb.log({"game_log": wandb.Table(data=table)})
+
+            if "comet_ml" in self.args.report_to:
+                log_table_to_comet_experiment(
+                    name="game_log.csv",
+                    table=table,
+                )
+
+        # Base evaluation
+        initial_output = super().evaluation_loop(
+            dataloader,
+            description,
+            prediction_loss_only,
+            ignore_keys,
+            metric_key_prefix,
+        )
+
+        return initial_output
+
+    def log(self, logs: dict[str, float], start_time: float | None = None) -> None:
+        """
+        Log `logs` on the various objects watching training, including stored metrics.
+
+        Args:
+            logs (`dict[str, float]`):
+                The values to log.
+            start_time (`float`, *optional*):
+                Start time of the training.
+        """
+        # logs either has 'loss' or 'eval_loss'
+        train_eval = "train" if "loss" in logs else "eval"
+        # Add averaged stored metrics to logs
+        for key, metrics in self._stored_metrics[train_eval].items():
+            logs[key] = torch.tensor(metrics).mean().item()
+        del self._stored_metrics[train_eval]
+        return super().log(logs, start_time)
+
+    def _shift_right(self, input_ids):
+        if self.decoder_start_token_id is None:
+            raise ValueError(
+                "model.config.decoder_start_token_id has to be defined. It is usually set to the pad_token_id."
+            )
+
+        # shift inputs to the right
+        if is_torch_fx_proxy(input_ids):
+            # Item assignment is not supported natively for proxies.
+            shifted_input_ids = torch.full(
+                input_ids.shape[:-1] + (1,), self.decoder_start_token_id
+            )
+            shifted_input_ids = torch.cat(
+                [shifted_input_ids, input_ids[..., :-1]], dim=-1
+            )
+        else:
+            shifted_input_ids = input_ids.new_zeros(input_ids.shape)
+            shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
+            shifted_input_ids[..., 0] = self.decoder_start_token_id
+
+        if self.pad_token_id is None:
+            raise ValueError("model.config.pad_token_id has to be defined.")
+        # replace possible -100 values in labels by `pad_token_id`
+        shifted_input_ids.masked_fill_(shifted_input_ids == -100, self.pad_token_id)
+
+        return shifted_input_ids
+
+    # Ensure the model card is saved along with the checkpoint
+    def _save_checkpoint(self, model, trial):
+        if self.args.hub_model_id is None:
+            model_name = Path(self.args.output_dir).name
+        else:
+            model_name = self.args.hub_model_id.split("/")[-1]
+        self.create_model_card(model_name=model_name)
+        super()._save_checkpoint(model, trial)
diff --git a/src/aixpert/training/training/trl/trainer/dpo_config.py b/src/aixpert/training/training/trl/trainer/dpo_config.py
new file mode 100644
index 0000000..5b302ad
--- /dev/null
+++ b/src/aixpert/training/training/trl/trainer/dpo_config.py
@@ -0,0 +1,549 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Any
+
+from transformers import TrainingArguments
+
+
+class FDivergenceType(Enum):
+    """Types of f-divergence functions for DPO loss regularization.
+
+    Attributes
+    ----------
+        REVERSE_KL: Reverse KL divergence.
+        JS_DIVERGENCE: Jensen-Shannon divergence.
+        ALPHA_DIVERGENCE: Alpha divergence.
+
+    Examples
+    --------
+        ```python
+        >>> from trl.trainer.dpo_config import DPOConfig, FDivergenceType
+
+        >>> config = DPOConfig(
+        ...     f_divergence_type=FDivergenceType.ALPHA_DIVERGENCE,
+        ...     f_alpha_divergence_coef=0.5,  # used only with ALPHA_DIVERGENCE
+        ... )
+        ```
+    """
+
+    REVERSE_KL = "reverse_kl"
+    JS_DIVERGENCE = "js_divergence"
+    ALPHA_DIVERGENCE = "alpha_divergence"
+
+
+class FDivergenceConstants:
+    """Constants for f-divergence types and their parameters.
+
+    Attributes
+    ----------
+        ALPHA_DIVERGENCE_COEF_KEY (`str`): Key for the alpha divergence coefficient.
+        ALPHA_DIVERGENCE_COEF_DEFAULT (`float`): Default value for the alpha divergence coefficient.
+    """
+
+    ALPHA_DIVERGENCE_COEF_KEY = "alpha_divergence_coef"
+    ALPHA_DIVERGENCE_COEF_DEFAULT = 1.0
+
+
+@dataclass
+class DPOConfig(TrainingArguments):
+    r"""
+    Configuration class for the [`DPOTrainer`].
+
+    This class includes only the parameters that are specific to DPO training. For a full list of training arguments,
+    please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this class may
+    differ from those in [`~transformers.TrainingArguments`].
+
+    Using [`~transformers.HfArgumentParser`] we can turn this class into
+    [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
+    command line.
+
+    Parameters
+    ----------
+        > Parameters that control the model and reference model
+
+        model_init_kwargs (`dict[str, Any]`, *optional*):
+            Keyword arguments for `AutoModelForCausalLM.from_pretrained`, used when the `model` argument of the
+            [`DPOTrainer`] is provided as a string.
+        ref_model_init_kwargs (`dict[str, Any]`, *optional*):
+            Keyword arguments for `AutoModelForCausalLM.from_pretrained`, used when the `ref_model` argument of the
+            [`DPOTrainer`] is provided as a string.
+        model_adapter_name (`str`, *optional*):
+            Name of the train target PEFT adapter, when using LoRA with multiple adapters.
+        ref_adapter_name (`str`, *optional*):
+            Name of the reference PEFT adapter, when using LoRA with multiple adapters.
+        force_use_ref_model (`bool`, *optional*, defaults to `False`):
+            If you provide a PEFT model as the active model and wish to use a different model for the `ref_model`, set
+            this flag to `True`.
+        disable_dropout (`bool`, *optional*, defaults to `True`):
+            Whether to disable dropout in the model and reference model.
+        use_logits_to_keep (`bool`, *optional*, defaults to `False`):
+            If `True`, only a specified number of logits are computed in the forward pass. This can be useful for
+            saving memory and speeding up training by not computing the logits for all tokens, especially in scenarios
+            when working with very long prompts where labels are ignored (-100).
+
+        > Parameters that control the data preprocessing
+
+        dataset_num_proc (`int`, *optional*):
+            Number of processes to use for processing the dataset.
+        pad_token (`str`, *optional*):
+            Token used for padding. If `None`, it defaults to `processing_class.pad_token`, or if that is also `None`,
+            it falls back to `processing_class.eos_token`.
+        label_pad_token_id (`int`, *optional*, defaults to `-100`):
+            Padding value to use for labels.
+        max_prompt_length (`int` or `None`, *optional*, defaults to `512`):
+            Maximum length of the prompt.
+        max_completion_length (`int`, *optional*):
+            Maximum length of the completion.
+        max_length (`int` or `None`, *optional*, defaults to `1024`):
+            Maximum length of the full sequence (prompt + completion).
+        truncation_mode (`str`, *optional*, defaults to `"keep_end"`):
+            Truncation mode to use when the sequence exceeds `max_length`. Possible values are `"keep_end"` and
+            `"keep_start"`.
+        padding_free (`bool`, *optional*, defaults to `False`):
+            Whether to perform forward passes without padding by flattening all sequences in the batch into a single
+            continuous sequence. This reduces memory usage by eliminating padding overhead. Currently, this is only
+            supported with the `flash_attention_2` attention implementation, which can efficiently handle the flattened
+            batch structure.
+        precompute_ref_log_probs (`bool`, *optional*, defaults to `False`):
+            Whether to precompute the log probabilities from the reference model. Setting this to `True` allows
+            training without needing the reference model during training, which can help reduce GPU memory usage. If
+            set to `False` (default), the reference model will be used during training to compute log probabilities
+            on-the-fly.
+        precompute_ref_batch_size (`int`, *optional*):
+            Batch size to use when precomputing reference model log probabilities. This can be set higher than the
+            training batch size to speed up preprocessing. If `None`, defaults to `per_device_train_batch_size` for
+            training and `per_device_eval_batch_size` for evaluation.
+        tools (`list[dict] | None`, *optional*):
+            List of tools (callable functions) that will be accessible to the model. If the template does not support
+            function calling, this argument will have no effect.
+
+        > Parameters that control the training
+
+        loss_type (`str` or `list[str]`, *optional*, defaults to `"sigmoid"`):
+            Type of loss to use. Possible values are:
+
+                - `"sigmoid"`: sigmoid loss from the original [DPO](https://huggingface.co/papers/2305.18290) paper.
+                - `"hinge"`: hinge loss on the normalized likelihood from the
+                  [SLiC](https://huggingface.co/papers/2305.10425) paper.
+                - `"ipo"`: IPO loss from the [IPO](https://huggingface.co/papers/2310.12036) paper.
+                - `"exo_pair"`: pairwise EXO loss from the [EXO](https://huggingface.co/papers/2402.00856) paper.
+                - `"nca_pair"`: pairwise NCA loss from the [NCA](https://huggingface.co/papers/2402.05369) paper.
+                - `"robust"`: unbiased estimate of the DPO loss that is robust to preference noise from the [Robust
+                  DPO](https://huggingface.co/papers/2403.00409) paper.
+                - `"bco_pair"`: pairwise BCO loss from the [BCO](https://huggingface.co/papers/2404.04656) paper.
+                - `"sppo_hard"`: SPPO loss with hard label from the [SPPO](https://huggingface.co/papers/2405.00675)
+                  paper.
+                - `"aot"`: AOT loss for paired datasets from the [AOT](https://huggingface.co/papers/2406.05882) paper.
+                - `"aot_pair"`: AOT loss for unpaired datasets from the [AOT](https://huggingface.co/papers/2406.05882)
+                  paper.
+                - `"discopop"`: DiscoPOP (a.k.a Log-Ratio Modulated Loss, LRML) loss from the
+                  [DiscoPOP](https://huggingface.co/papers/2406.08414) paper.
+                - `"apo_zero"`: APO-zero loss from the [APO](https://huggingface.co/papers/2408.06266) paper.
+                - `"apo_down"`: APO-down loss from the [APO](https://huggingface.co/papers/2408.06266) paper.
+                - `"sft"`: Negative log-likelihood loss (standard supervised fine-tuning loss).
+
+            Multiple loss types can be combined using comma separation (e.g., `["sigmoid", "bco_pair", "sft"]` for
+            [MPO](https://huggingface.co/papers/2411.10442)). The `loss_weights` parameter can be used to specify
+            corresponding weights for each loss type.
+
+        use_liger_loss (`bool`, *optional*):
+            Whether to use Liger loss.
+
+            <Deprecated version="0.25.0">
+
+            Parameter `use_liger_loss` is deprecated and will be removed in version 0.28.0. Use `use_liger_kernel`
+            instead.
+
+            </Deprecated>
+
+        base_model_attribute_name (`str`, *optional*, defaults to `"model"`):
+            Name of the attribute in the model that contains the base model. This is used to get the base model from
+            the model when the model does not have a `get_decoder` method in the case when `use_liger_kernel` is
+            `True`.
+        beta (`float`, *optional*, defaults to `0.1`):
+            Parameter controlling the deviation from the reference model. Higher β means less deviation from the
+            reference model. For the IPO loss (`loss_type="ipo"`), β is the regularization parameter denoted by τ in
+            the [paper](https://huggingface.co/papers/2310.12036).
+        f_divergence_type ([`FDivergenceType`] or `str`, *optional*, defaults to `FDivergenceType.REVERSE_KL`):
+            Type of f-divergence regularization function to compute divergence between policy and reference model.
+        f_alpha_divergence_coef (`float`, *optional*, defaults to `1.0`):
+            α coefficient in the α-divergence u^-α regularization function for DPO loss.
+        reference_free (`bool`, *optional*, defaults to `False`):
+            Whether to ignore the provided reference model and implicitly use a reference model that assigns equal
+            probability to all responses.
+        label_smoothing (`float`, *optional*, defaults to `0.0`):
+            Robust DPO label smoothing parameter from the [cDPO report](https://ericmitchell.ai/cdpo.pdf) and [Robust
+            DPO](https://huggingface.co/papers/2403.00409) paper that should be between `0.0` and `0.5`.
+        use_weighting (`bool`, *optional*, defaults to `False`):
+            Whether to weight the loss as done in the [WPO paper](https://huggingface.co/papers/2406.11827).
+        rpo_alpha (`float`, *optional*):
+            α parameter from the [RPO paper](https://huggingface.co/papers/2404.19733) (v3), which controls the
+            weighting of the NLL term in the loss. If `None`, no weighting is applied and the loss is the same as the
+            DPO loss. The paper recommends `rpo_alpha=1.0`.
+        ld_alpha (`float`, *optional*):
+            α parameter from the [LD-DPO paper](https://huggingface.co/papers/2409.06411), which controls the weighting
+            of the verbose token log-probabilities in responses. If `None`, no weighting is applied to the verbose
+            part, and the loss is equivalent to the standard DPO loss. The paper recommends setting `ld_alpha` between
+            `0.0` and `1.0`.
+        discopop_tau (`float`, *optional*, defaults to `0.05`):
+            τ/temperature parameter from the [DiscoPOP](https://huggingface.co/papers/2406.08414) paper, which controls
+            the shape of log ratio modulated loss. The paper recommends the default value `discopop_tau=0.05`.
+        loss_weights (`list[float]`, *optional*):
+            List of loss weights for multi-loss combinations. Used when combining multiple loss types. Example: `[0.8,
+            0.2, 1.0]` for [MPO](https://huggingface.co/papers/2411.10442). If not provided, defaults to equal weights
+            (`1.0`) for all loss types.
+        sync_ref_model (`bool`, *optional*, defaults to `False`):
+            Whether to synchronize the reference model with the active model every `ref_model_sync_steps` steps, using
+            the `ref_model_mixup_alpha` parameter. This synchronization originates from the
+            [TR-DPO](https://huggingface.co/papers/2404.09656) paper.
+        ref_model_mixup_alpha (`float`, *optional*, defaults to `0.6`):
+            α parameter from the [TR-DPO](https://huggingface.co/papers/2404.09656) paper, which controls the mix
+            between the current policy and the previous reference policy during updates. The reference policy is
+            updated according to the equation: `π_ref = α * π_θ + (1 - α) * π_ref_prev`. To use this parameter, you
+            must set `sync_ref_model=True`.
+        ref_model_sync_steps (`int`, *optional*, defaults to `512`):
+            τ parameter from the [TR-DPO](https://huggingface.co/papers/2404.09656) paper, which determines how
+            frequently the current policy is synchronized with the reference policy. To use this parameter, you must
+            set `sync_ref_model=True`.
+
+        > Parameters that control the logging
+
+        generate_during_eval (`bool`, *optional*, defaults to `False`):
+            Whether to generate and log completions from both the model and the reference model to W&B or Comet during
+            evaluation.
+
+        > Deprecated parameters
+
+        padding_value:
+
+            <Deprecated version="0.24.0">
+
+            This parameter is deprecated and will be removed in version 0.26.0. Use `pad_token` (`str`) instead.
+
+            </Deprecated>
+    """
+
+    _VALID_DICT_FIELDS = TrainingArguments._VALID_DICT_FIELDS + [
+        "model_init_kwargs",
+        "ref_model_init_kwargs",
+    ]
+
+    # Parameters whose default values are overridden from TrainingArguments
+    learning_rate: float = field(
+        default=1e-6,
+        metadata={"help": "The initial learning rate for AdamW."},
+    )
+    logging_steps: float = field(
+        default=10,
+        metadata={
+            "help": "Log every X updates steps. Should be an integer or a float in range `[0,1)`. If smaller than 1, "
+            "will be interpreted as ratio of total training steps."
+        },
+    )
+    gradient_checkpointing: bool = field(
+        default=True,
+        metadata={
+            "help": "If True, use gradient checkpointing to save memory at the expense of slower backward pass."
+        },
+    )
+    bf16: bool | None = field(
+        default=None,
+        metadata={
+            "help": "Whether to use bf16 (mixed) precision instead of 32-bit. Requires Ampere or higher NVIDIA "
+            "architecture or Intel XPU or using CPU (use_cpu) or Ascend NPU. If not set, it defaults to `True` if "
+            "`fp16` is not set."
+        },
+    )
+
+    # Parameters that control the model and reference model
+    model_init_kwargs: dict[str, Any] | None = field(
+        default=None,
+        metadata={
+            "help": "Keyword arguments for `AutoModelForCausalLM.from_pretrained`, used when the `model` argument of "
+            "the `DPOTrainer` is provided as a string."
+        },
+    )
+    ref_model_init_kwargs: dict[str, Any] | None = field(
+        default=None,
+        metadata={
+            "help": "Keyword arguments for `AutoModelForCausalLM.from_pretrained`, used when the `ref_model` argument "
+            "of the `DPOTrainer` is provided as a string."
+        },
+    )
+    model_adapter_name: str | None = field(
+        default=None,
+        metadata={
+            "help": "Name of the train target PEFT adapter, when using LoRA with multiple adapters."
+        },
+    )
+    ref_adapter_name: str | None = field(
+        default=None,
+        metadata={
+            "help": "Name of the reference PEFT adapter, when using LoRA with multiple adapters."
+        },
+    )
+    force_use_ref_model: bool = field(
+        default=False,
+        metadata={
+            "help": "If you provide a PEFT model as the active model and wish to use a different model for the "
+            "`ref_model`, set this flag to `True`."
+        },
+    )
+    disable_dropout: bool = field(
+        default=True,
+        metadata={
+            "help": "Whether to disable dropout in the model and reference model."
+        },
+    )
+    use_logits_to_keep: bool = field(
+        default=False,
+        metadata={
+            "help": "If `True`, only a specified number of logits are computed in the forward pass. This can be "
+            "useful for saving memory and speeding up training by not computing the logits for all tokens, especially "
+            "in scenarios when working with very long prompts where labels are ignored (-100)."
+        },
+    )
+
+    # Parameters that control the data preprocessing
+    dataset_num_proc: int | None = field(
+        default=None,
+        metadata={"help": "Number of processes to use for processing the dataset."},
+    )
+    pad_token: str | None = field(
+        default=None,
+        metadata={
+            "help": "Token used for padding. If `None`, it defaults to `processing_class.pad_token`, or if that "
+            "is also `None`, it falls back to `processing_class.eos_token`."
+        },
+    )
+    label_pad_token_id: int = field(
+        default=-100,
+        metadata={"help": "Padding value to use for labels."},
+    )
+    max_prompt_length: int | None = field(
+        default=512,
+        metadata={"help": "Maximum length of the prompt."},
+    )
+    max_completion_length: int | None = field(
+        default=None,
+        metadata={"help": "Maximum length of the completion."},
+    )
+    max_length: int | None = field(
+        default=1024,
+        metadata={"help": "Maximum length of the full sequence (prompt + completion)."},
+    )
+    truncation_mode: str = field(
+        default="keep_end",
+        metadata={
+            "help": "Truncation mode to use when the sequence exceeds `max_length`. Possible values are `'keep_end'` "
+            "and `'keep_start'`.",
+            "choices": ["keep_end", "keep_start"],
+        },
+    )
+    padding_free: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to perform forward passes without padding by flattening all sequences in the batch into "
+            "a single continuous sequence. This reduces memory usage by eliminating padding overhead. Currently, "
+            "this is only supported with the `flash_attention_2` attention implementation, which can efficiently "
+            "handle the flattened batch structure."
+        },
+    )
+    precompute_ref_log_probs: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to precompute the log probabilities from the reference model. Setting this to `True` "
+            "allows training without needing the reference model during training, which can help reduce GPU memory "
+            "usage. If set to `False` (default), the reference model will be used during training to compute log "
+            "probabilities on-the-fly."
+        },
+    )
+    precompute_ref_batch_size: int | None = field(
+        default=None,
+        metadata={
+            "help": "Batch size to use when precomputing reference model log probabilities. This can be set higher "
+            "than the training batch size to speed up preprocessing. If `None`, defaults to "
+            "`per_device_train_batch_size` for training and `per_device_eval_batch_size` for evaluation."
+        },
+    )
+    tools: list[dict] | None = field(
+        default=None,
+        metadata={
+            "help": "List of tools (callable functions) that will be accessible to the model. If the template does "
+            "not support function calling, this argument will have no effect."
+        },
+    )
+
+    # Parameters that control the training
+    loss_type: list[str] = field(
+        default_factory=lambda: ["sigmoid"],
+        metadata={
+            "help": "Type of loss to use. Possible values are: `'sigmoid'`, `'hinge'`, `'ipo'`, `'exo_pair'`, "
+            "`'nca_pair'`, `'robust'`, `'bco_pair'`, `'sppo_hard'`, `'aot'`, `'aot_pair'`, `'discopop'`, "
+            "`'apo_zero'`, `'apo_down'` and `'sft'`. Multiple loss types can be combined using comma separation "
+            "(e.g., `['sigmoid', 'bco_pair', 'sft']` for MPO). The `loss_weights` parameter can be used to specify "
+            "corresponding weights for each loss type."
+        },
+    )
+    use_liger_loss: bool = field(
+        default=None,
+        metadata={"help": "Whether to use Liger loss."},
+    )
+    base_model_attribute_name: str = field(
+        default="model",
+        metadata={
+            "help": "Name of the attribute in the model that contains the base model. This is used to get the base "
+            "model  from the model when the model does not have a `get_decoder` method in the case when "
+            "`use_liger_kernel` is `True`."
+        },
+    )
+    beta: float = field(
+        default=0.1,
+        metadata={
+            "help": "Parameter controlling the deviation from the reference model. "
+            "Higher β means less deviation from the reference model."
+        },
+    )
+    f_divergence_type: FDivergenceType | str = field(
+        default=FDivergenceType.REVERSE_KL,
+        metadata={
+            "help": "Type of f-divergence regularization function to compute divergence between policy and reference "
+            "model."
+        },
+    )
+    f_alpha_divergence_coef: float = field(
+        default=1.0,
+        metadata={
+            "help": "α coefficient in the α-divergence u^-α regularization function for DPO loss."
+        },
+    )
+    reference_free: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to ignore the provided reference model and implicitly use a reference model that assigns "
+            "equal probability to all responses."
+        },
+    )
+    label_smoothing: float = field(
+        default=0.0,
+        metadata={
+            "help": "Robust DPO label smoothing parameter from the cDPO report and Robust DPO paper that should "
+            "be between `0.0` and `0.5`."
+        },
+    )
+    use_weighting: bool = field(
+        default=False,
+        metadata={"help": "Whether to weight the loss as done in the WPO paper."},
+    )
+    rpo_alpha: float | None = field(
+        default=None,
+        metadata={
+            "help": "α parameter from the RPO paper (v3), which controls the weighting of the NLL term in the loss. "
+            "If `None`, no weighting is applied and the loss is the same as the DPO loss. The paper recommends "
+            "`rpo_alpha=1.0`."
+        },
+    )
+    ld_alpha: float | None = field(
+        default=None,
+        metadata={
+            "help": "α parameter from the LD-DPO paper, which controls the weighting of the verbose token "
+            "log-probabilities in responses. If `None`, no weighting is applied to the verbose part, and the loss is "
+            "equivalent to the standard DPO loss. The paper recommends setting `ld_alpha` between `0.0` and `1.0`.",
+        },
+    )
+    discopop_tau: float = field(
+        default=0.05,
+        metadata={
+            "help": "τ/temperature parameter from the DiscoPOP paper, which controls the shape of log ratio modulated "
+            "loss. The paper recommends the default value `discopop_tau=0.05`."
+        },
+    )
+    loss_weights: list[float] | None = field(
+        default=None,
+        metadata={
+            "help": "List of loss weights for multi-loss combinations. Used when combining multiple loss types. "
+            "Example: `[0.8, 0.2, 1.0]` for MPO. If not provided, defaults to equal weights (`1.0`) for all loss "
+            "types."
+        },
+    )
+    sync_ref_model: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to synchronize the reference model with the active model every `ref_model_sync_steps` "
+            "steps, using the `ref_model_mixup_alpha` parameter."
+        },
+    )
+    ref_model_mixup_alpha: float = field(
+        default=0.6,
+        metadata={
+            "help": "α parameter from the TR-DPO paper, which controls the mix between the current policy and the "
+            "previous reference policy during updates. The reference policy is updated according to the equation: "
+            "`π_ref = α * π_θ + (1 - α) * π_ref_prev`. To use this parameter, you must set `sync_ref_model=True`."
+        },
+    )
+    ref_model_sync_steps: int = field(
+        default=512,
+        metadata={
+            "help": "τ parameter from the TR-DPO paper, which determines how frequently the current policy is "
+            "synchronized with the reference policy. To use this parameter, you must set `sync_ref_model=True`."
+        },
+    )
+
+    # Parameters that control the logging
+    generate_during_eval: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to generate and log completions from both the model and the reference model to W&B, MLFLow "
+            "or Comet during evaluation."
+        },
+    )
+
+    # Deprecated arguments
+    padding_value: int | None = field(
+        default=None,
+        metadata={"help": "Deprecated, use `pad_token` (str) instead."},
+    )
+
+    def __post_init__(self):
+        self.bf16 = not (self.fp16) if self.bf16 is None else self.bf16
+        self.f_divergence_type = FDivergenceType(self.f_divergence_type)
+
+        # Normalize loss_type to string format for internal use
+        if hasattr(self.loss_type, "__len__") and len(self.loss_type) == 1:
+            self.loss_type = self.loss_type[0]
+
+        # Validate loss_type
+        if self.loss_weights is not None:
+            loss_types = (
+                self.loss_type if isinstance(self.loss_type, list) else [self.loss_type]
+            )
+            if len(self.loss_weights) != len(loss_types):
+                raise ValueError(
+                    f"Length of loss_weights list ({self.loss_weights}) must match number of loss types "
+                    f"({loss_types})."
+                )
+
+        if self.use_liger_loss is not None:
+            warnings.warn(
+                "The `use_liger_loss` argument is deprecated and will be removed in version 0.28.0. Please use "
+                "`use_liger_kernel` instead.",
+                FutureWarning,
+                stacklevel=2,
+            )
+            self.use_liger_kernel = self.use_liger_loss
+        super().__post_init__()
diff --git a/src/aixpert/training/training/trl/trainer/dpo_trainer.py b/src/aixpert/training/training/trl/trainer/dpo_trainer.py
new file mode 100644
index 0000000..4ac420c
--- /dev/null
+++ b/src/aixpert/training/training/trl/trainer/dpo_trainer.py
@@ -0,0 +1,2402 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import random
+import textwrap
+import warnings
+from collections import defaultdict
+from collections.abc import Callable
+from contextlib import contextmanager, nullcontext
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Literal
+
+import pandas as pd
+import torch
+import torch.nn.functional as F
+from accelerate import PartialState, logging
+from accelerate.utils import tqdm
+from datasets import Dataset, IterableDataset
+from torch import autocast, nn
+from torch.utils.data import DataLoader
+from transformers import (
+    AutoProcessor,
+    BaseImageProcessor,
+    DataCollator,
+    FeatureExtractionMixin,
+    PreTrainedModel,
+    PreTrainedTokenizerBase,
+    ProcessorMixin,
+)
+from transformers.data.data_collator import DataCollatorMixin
+from transformers.integrations import (
+    is_comet_available,
+    is_mlflow_available,
+    is_wandb_available,
+)
+from transformers.models.auto.modeling_auto import (
+    MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES,
+)
+from transformers.trainer_callback import TrainerCallback
+from transformers.trainer_utils import EvalLoopOutput
+from transformers.utils import is_liger_kernel_available, is_peft_available
+
+from ..data_utils import maybe_apply_chat_template, maybe_extract_prompt
+from ..models import create_reference_model, prepare_deepspeed
+from ..models.utils import prepare_fsdp
+from .base_trainer import BaseTrainer
+from .callbacks import SyncRefModelCallback
+from .dpo_config import DPOConfig, FDivergenceConstants, FDivergenceType
+from .utils import (
+    RunningMoments,
+    cap_exp,
+    create_model_from_path,
+    disable_dropout_in_model,
+    empty_cache,
+    flush_left,
+    flush_right,
+    get_config_model_id,
+    log_table_to_comet_experiment,
+    pad,
+    pad_to_length,
+    peft_module_casting_to_bf16,
+    selective_log_softmax,
+)
+
+
+if is_peft_available():
+    from peft import (
+        PeftConfig,
+        PeftModel,
+        get_peft_model,
+        prepare_model_for_kbit_training,
+    )
+
+if is_liger_kernel_available():
+    from liger_kernel.chunked_loss import LigerFusedLinearDPOLoss
+
+
+if is_wandb_available():
+    import wandb
+
+if is_mlflow_available():
+    import mlflow
+
+
+logger = logging.get_logger(__name__)
+
+
+def shift_tokens_right(
+    input_ids: torch.Tensor, decoder_start_token_id: int
+) -> torch.Tensor:
+    """Shift input ids one token to the right, and pad with pad_token_id"""
+    shifted_input_ids = input_ids.new_zeros(input_ids.shape)
+    shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
+    shifted_input_ids[:, 0] = decoder_start_token_id
+    return shifted_input_ids
+
+
+@dataclass
+class DataCollatorForPreference(DataCollatorMixin):
+    """
+    Data collator used for preference data. Inputs are dynamically padded to the maximum length of a batch if they are
+    not all of the same length.
+
+    Args:
+        pad_token_id (`int`):
+            Token ID to use for padding.
+        return_tensors (`str`, *optional*, defaults to `"pt"`):
+            Type of Tensor to return. Only `"pt"` is currently supported.
+
+    Examples
+    --------
+    ```python
+    >>> from trl import DataCollatorForPreference
+
+    >>> collator = DataCollatorForPreference(pad_token_id=0)
+    >>> examples = [
+    ...     {"prompt_input_ids": [1, 2, 3], "chosen_input_ids": [4, 5], "rejected_input_ids": [6]},
+    ...     {"prompt_input_ids": [7, 8], "chosen_input_ids": [9, 10], "rejected_input_ids": [11, 12, 13]},
+    ... ]
+    >>> collator(examples)
+    {'prompt_input_ids': tensor([[1, 2, 3],
+                                 [0, 7, 8]]),
+     'prompt_attention_mask': tensor([[1, 1, 1],
+                                      [0, 1, 1]]),
+     'chosen_input_ids': tensor([[ 4,  5],
+                                 [ 9, 10]]),
+     'chosen_attention_mask': tensor([[1, 1],
+                                      [1, 1]]),
+     'rejected_input_ids': tensor([[ 6,  0,  0],
+                                   [11, 12, 13]]),
+     'rejected_attention_mask': tensor([[1, 0, 0],
+                                        [1, 1, 1]])
+    }
+    ```
+    """
+
+    pad_token_id: int
+    return_tensors: str = "pt"
+
+    def torch_call(
+        self, examples: list[list[int] | Any | dict[str, Any]]
+    ) -> dict[str, Any]:
+        # Convert to tensor
+        prompt_input_ids = [
+            torch.tensor(example["prompt_input_ids"]) for example in examples
+        ]
+        prompt_attention_mask = [
+            torch.ones_like(input_ids) for input_ids in prompt_input_ids
+        ]
+        chosen_input_ids = [
+            torch.tensor(example["chosen_input_ids"]) for example in examples
+        ]
+        chosen_attention_mask = [
+            torch.ones_like(input_ids) for input_ids in chosen_input_ids
+        ]
+        rejected_input_ids = [
+            torch.tensor(example["rejected_input_ids"]) for example in examples
+        ]
+        rejected_attention_mask = [
+            torch.ones_like(input_ids) for input_ids in rejected_input_ids
+        ]
+        if "pixel_values" in examples[0]:
+            pixel_values = [
+                torch.tensor(example["pixel_values"]) for example in examples
+            ]
+        if "pixel_attention_mask" in examples[0]:
+            pixel_attention_mask = [
+                torch.tensor(example["pixel_attention_mask"]) for example in examples
+            ]
+        if "ref_chosen_logps" in examples[0] and "ref_rejected_logps" in examples[0]:
+            ref_chosen_logps = torch.tensor(
+                [example["ref_chosen_logps"] for example in examples]
+            )
+            ref_rejected_logps = torch.tensor(
+                [example["ref_rejected_logps"] for example in examples]
+            )
+
+        # Pad
+        output = {}
+        output["prompt_input_ids"] = pad(
+            prompt_input_ids, padding_value=self.pad_token_id, padding_side="left"
+        )
+        output["prompt_attention_mask"] = pad(
+            prompt_attention_mask, padding_value=0, padding_side="left"
+        )
+        output["chosen_input_ids"] = pad(
+            chosen_input_ids, padding_value=self.pad_token_id
+        )
+        output["chosen_attention_mask"] = pad(chosen_attention_mask, padding_value=0)
+        output["rejected_input_ids"] = pad(
+            rejected_input_ids, padding_value=self.pad_token_id
+        )
+        output["rejected_attention_mask"] = pad(
+            rejected_attention_mask, padding_value=0
+        )
+        if "pixel_values" in examples[0]:
+            output["pixel_values"] = pad(pixel_values, padding_value=0.0)
+        if "pixel_attention_mask" in examples[0]:
+            output["pixel_attention_mask"] = pad(pixel_attention_mask, padding_value=0)
+        if "image_sizes" in examples[0]:
+            output["image_sizes"] = torch.tensor(
+                [example["image_sizes"] for example in examples]
+            )
+        if "ref_chosen_logps" in examples[0] and "ref_rejected_logps" in examples[0]:
+            output["ref_chosen_logps"] = ref_chosen_logps
+            output["ref_rejected_logps"] = ref_rejected_logps
+        if "token_type_ids" in examples[0]:
+            token_type_ids = [
+                torch.tensor(example["token_type_ids"]) for example in examples
+            ]
+            output["token_type_ids"] = pad(
+                token_type_ids, padding_value=0, padding_side="left"
+            )
+
+        return output
+
+
+class DPOTrainer(BaseTrainer):
+    """
+    Trainer for Direct Preference Optimization (DPO) method.
+
+    This class is a wrapper around the [`transformers.Trainer`] class and inherits all of its attributes and methods.
+
+    Args:
+        model (`str | PreTrainedModel`):
+            Model to be trained. Can be either:
+
+            - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or a
+              path to a *directory* containing model weights saved using
+              [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded
+              using [`~transformers.AutoModelForCausalLM.from_pretrained`] with the keyword arguments in
+              `args.model_init_kwargs`.
+            - A [`~transformers.PreTrainedModel`] object. Only causal language models are supported.
+        ref_model ([`PreTrainedModelWrapper`]):
+            Hugging Face transformer model with a casual language modelling head. Used for implicit reward computation
+            and loss. If no reference model is provided, the trainer will create a reference model with the same
+            architecture as the model to be optimized.
+        args ([`DPOConfig`], *optional*):
+            Configuration for this trainer. If `None`, a default configuration is used.
+        data_collator ([`~transformers.DataCollator`], *optional*):
+            Function to use to form a batch from a list of elements of the processed `train_dataset` or `eval_dataset`.
+            Will default to [`DataCollatorForPreference`].
+        train_dataset ([`~datasets.Dataset`] or [`~datasets.IterableDataset`]):
+            Dataset to use for training. DPO supports [preference](#preference) type and. The format of the samples can
+            be either:
+
+            - [Standard](dataset_formats#standard): Each sample contains plain text.
+            - [Conversational](dataset_formats#conversational): Each sample contains structured messages (e.g., role
+              and content).
+        eval_dataset ([`~datasets.Dataset`], [`~datasets.IterableDataset`] or `dict[str, Dataset | IterableDataset]`):
+            Dataset to use for evaluation. It must meet the same requirements as `train_dataset`.
+        processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*):
+            Processing class used to process the data. If `None`, the processing class is loaded from the model's name
+            with [`~transformers.AutoTokenizer.from_pretrained`].
+        compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
+            The function that will be used to compute metrics at evaluation. Must take a [`EvalPrediction`] and return
+            a dictionary string to metric values. *Note* When passing TrainingArgs with `batch_eval_metrics` set to
+            `True`, your compute_metrics function must take a boolean `compute_result` argument. This will be triggered
+            after the last eval batch to signal that the function needs to calculate and return the global summary
+            statistics rather than accumulating the batch-level statistics.
+        callbacks (list of [`~transformers.TrainerCallback`], *optional*):
+            List of callbacks to customize the training loop. Will add those to the list of default callbacks detailed
+            in [here](https://huggingface.co/docs/transformers/main_classes/callback).
+
+            If you want to remove one of the default callbacks used, use the [`~transformers.Trainer.remove_callback`]
+            method.
+        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*, defaults to `(None, None)`):
+            A tuple containing the optimizer and the scheduler to use. Will default to an instance of [`AdamW`] on your
+            model and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`.
+        optimizer_cls_and_kwargs (`Tuple[Type[torch.optim.Optimizer], Dict[str, Any]]`, *optional*):
+            A tuple containing the optimizer class and keyword arguments to use. Overrides `optim` and `optim_args` in
+            `args`. Incompatible with the `optimizers` argument.
+        preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`, *optional*):
+            A function that preprocess the logits right before caching them at each evaluation step. Must take two
+            tensors, the logits and the labels, and return the logits once processed as desired. The modifications made
+            by this function will be reflected in the predictions received by `compute_metrics`.
+
+            Note that the labels (second parameter) will be `None` if the dataset does not have them.
+        peft_config ([`~peft.PeftConfig`], *optional*):
+            PEFT configuration used to wrap the model. If `None`, the model is not wrapped.
+    """
+
+    _tag_names = ["trl", "dpo"]
+    _name = "DPO"
+    _paper = {
+        "title": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model",
+        "id": "2305.18290",
+        # docstyle-ignore
+        "citation": textwrap.dedent("""\
+            @inproceedings{rafailov2023direct,
+                title        = {{Direct Preference Optimization: Your Language Model is Secretly a Reward Model}},
+                author       = {Rafael Rafailov and Archit Sharma and Eric Mitchell and Christopher D. Manning and Stefano Ermon and Chelsea Finn},
+                year         = 2023,
+                booktitle    = {Advances in Neural Information Processing Systems 36: Annual Conference on Neural Information Processing Systems 2023, NeurIPS 2023, New Orleans, LA, USA, December 10 - 16, 2023},
+                url          = {http://papers.nips.cc/paper_files/paper/2023/hash/a85b405ed65c6477a4fe8302b5e06ce7-Abstract-Conference.html},
+                editor       = {Alice Oh and Tristan Naumann and Amir Globerson and Kate Saenko and Moritz Hardt and Sergey Levine},
+            }"""),
+    }
+
+    def __init__(
+        self,
+        model: str | nn.Module | PreTrainedModel,
+        ref_model: PreTrainedModel | nn.Module | str | None = None,
+        args: DPOConfig | None = None,
+        data_collator: DataCollator | None = None,  # type: ignore
+        train_dataset: Dataset | IterableDataset | None = None,
+        eval_dataset: Dataset
+        | IterableDataset
+        | dict[str, Dataset | IterableDataset]
+        | None = None,
+        processing_class: PreTrainedTokenizerBase
+        | BaseImageProcessor
+        | FeatureExtractionMixin
+        | ProcessorMixin
+        | None = None,
+        compute_metrics: Callable[[EvalLoopOutput], dict] | None = None,
+        callbacks: list[TrainerCallback] | None = None,
+        optimizers: tuple[
+            torch.optim.Optimizer | None, torch.optim.lr_scheduler.LambdaLR | None
+        ] = (None, None),
+        optimizer_cls_and_kwargs: tuple[type[torch.optim.Optimizer], dict[str, Any]]
+        | None = None,
+        preprocess_logits_for_metrics: Callable[
+            [torch.Tensor, torch.Tensor], torch.Tensor
+        ]
+        | None = None,
+        peft_config: "PeftConfig | None" = None,
+    ):
+        # Args
+        if args is None:
+            model_name = (
+                model if isinstance(model, str) else get_config_model_id(model.config)
+            )
+            model_name = model_name.split("/")[-1]
+            args = DPOConfig(f"{model_name}-DPO")
+
+        # Model and reference model
+        if isinstance(model, str):
+            model = create_model_from_path(model, **args.model_init_kwargs or {})
+        elif args.model_init_kwargs is not None:
+            logger.warning(
+                "You passed `model_init_kwargs` to the `DPOConfig`, but your model is already instantiated. "
+                "The `model_init_kwargs` will be ignored."
+            )
+        model_id = get_config_model_id(model.config)
+        if isinstance(ref_model, str):
+            ref_model = create_model_from_path(
+                ref_model, **args.ref_model_init_kwargs or {}
+            )
+        elif args.ref_model_init_kwargs is not None:
+            logger.warning(
+                "You passed `ref_model_init_kwargs` to the `DPOConfig`, but your model is already instantiated. "
+                "The `ref_model_init_kwargs` will be ignored."
+            )
+        if ref_model is model:
+            raise ValueError(
+                "`model` and `ref_model` cannot be the same object. If you want `ref_model` to be the "
+                "same as `model`, you can simply omit the `ref_model` argument and it will be created for you."
+            )
+
+        # Processing class
+        if processing_class is None:
+            processing_class = AutoProcessor.from_pretrained(model_id)
+
+        # Handle pad token for processors or tokenizers
+        if isinstance(processing_class, ProcessorMixin):
+            tokenizer = processing_class.tokenizer
+            self._is_vlm = True
+        elif isinstance(processing_class, PreTrainedTokenizerBase):
+            tokenizer = processing_class
+            self._is_vlm = False
+        else:
+            raise TypeError(
+                "The `processing_class` must be either a `PreTrainedTokenizerBase` or a `ProcessorMixin`"
+            )
+
+        # Get the pad token: if not provided, use the one from the processing class or the eos token
+        # if the processing class does not have a pad token.
+        if args.padding_value is not None:  # deprecated, will be removed in 0.26.0.
+            warnings.warn(
+                "The `padding_value` argument is deprecated and will be removed in version 0.26.0. Please use "
+                "`pad_token` (str) instead."
+            )
+            self.pad_token_id = args.padding_value
+        else:
+            pad_token = args.pad_token or tokenizer.pad_token or tokenizer.eos_token
+            self.pad_token_id = tokenizer.convert_tokens_to_ids(pad_token)
+            if self.pad_token_id is None:
+                raise ValueError(
+                    f"The specified `pad_token` ('{pad_token}') is not found in the vocabulary of the given "
+                    f"`processing_class` ({processing_class.__class__.__name__}). Ensure that the `pad_token` exists "
+                    "in the vocabulary before using it as a padding token."
+                )
+
+        # PEFT configuration and model wrapping
+        model = self._prepare_peft_model(model, ref_model, peft_config, args)
+
+        if args.generate_during_eval and not (
+            is_wandb_available() or is_comet_available() or is_mlflow_available()
+        ):
+            raise ValueError(
+                "`generate_during_eval=True` requires Weights and Biases, MLFlow or Comet to be installed."
+                " Please install `wandb`, `mlflow` or `comet-ml` to resolve."
+            )
+
+        self.is_encoder_decoder = model.config.is_encoder_decoder
+        self.is_vision_model = (
+            model.config.model_type in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.keys()
+        )
+        self.is_peft_model = is_peft_available() and isinstance(model, PeftModel)
+        self.model_adapter_name = args.model_adapter_name
+        self.ref_adapter_name = args.ref_adapter_name
+        self.reference_free = args.reference_free
+
+        if ref_model:
+            self.ref_model = ref_model
+        elif self.is_peft_model or args.precompute_ref_log_probs:
+            # The `model` with adapters turned off will be used as the reference model
+            self.ref_model = None
+        else:
+            self.ref_model = create_reference_model(model)
+
+        # Disable dropout in the model and reference model
+        if args.disable_dropout:
+            disable_dropout_in_model(model)
+            if self.ref_model is not None:
+                disable_dropout_in_model(self.ref_model)
+
+        # Liger kernel
+        if args.use_liger_kernel:
+            if not is_liger_kernel_available():
+                raise ImportError(
+                    "You set `use_liger_kernel=True` but the liger kernel is not available. "
+                    "Please install liger-kernel first: `pip install liger-kernel`"
+                )
+            if args.loss_type not in [
+                "sigmoid",
+                "apo_zero",
+                "apo_down",
+                "sppo_hard",
+                "nca_pair",
+            ]:
+                raise ValueError(
+                    "You set `use_liger_kernel=True` but the loss type is not from `[sigmoid, apo_zero, apo_down, sppo_hard, nca_pair`. "
+                    "Please set `loss_type='[sigmoid | apo_zero | apo_down | sppo_hard | nca_pair]'` to use the liger kernel."
+                )
+            self.dpo_loss_fn = LigerFusedLinearDPOLoss(
+                ignore_index=args.label_pad_token_id,
+                beta=args.beta,
+                use_ref_model=not args.reference_free,
+                average_log_prob=False,
+                loss_type=args.loss_type,
+            )
+        # The trainer estimates the number of FLOPs (floating-point operations) using the number of elements in the
+        # input tensor associated with the key "input_ids". However, in DPO, the sampled data does not include the
+        # "input_ids" key. Instead, the available keys are "prompt_input_ids", "chosen_input_ids", and
+        # "rejected_input_ids". As a result, the trainer issues the warning: "Could not estimate the number of tokens
+        # of the input, floating-point operations will not be computed." To suppress this warning, we set the
+        # "estimate_tokens" key in the model's "warnings_issued" dictionary to True. This acts as a flag to indicate
+        # that the warning has already been issued.
+        model.warnings_issued["estimate_tokens"] = True
+
+        # Data collator
+        if data_collator is None:
+            data_collator = DataCollatorForPreference(pad_token_id=self.pad_token_id)
+
+        self.generate_during_eval = args.generate_during_eval
+        self.label_pad_token_id = args.label_pad_token_id
+        self.max_prompt_length = args.max_prompt_length
+        self.max_completion_length = args.max_completion_length
+        self.max_length = args.max_length
+        self.truncation_mode = args.truncation_mode
+        self.precompute_ref_log_probs = args.precompute_ref_log_probs
+        self.use_logits_to_keep = args.use_logits_to_keep
+
+        if args.padding_free:
+            if model.config._attn_implementation != "flash_attention_2":
+                logger.warning(
+                    "Padding-free training is enabled, but the attention implementation is not set to "
+                    "'flash_attention_2'. Padding-free training flattens batches into a single sequence, and "
+                    "'flash_attention_2' is the only known attention mechanism that reliably supports this. Using "
+                    "other implementations may lead to unexpected behavior. To ensure compatibility, set "
+                    "`attn_implementation='flash_attention_2'` in the model configuration, or verify that your "
+                    "attention mechanism can handle flattened sequences."
+                )
+            if args.per_device_train_batch_size == 1:
+                logger.warning(
+                    "You are using a per_device_train_batch_size of 1 with padding-free training. Using a batch size "
+                    "of 1 anihilate the benefits of padding-free training. Please consider increasing the batch size "
+                    "to at least 2."
+                )
+        self.padding_free = args.padding_free
+
+        # Since ref_logs are precomputed on the first call to get_train/eval_dataloader
+        # keep track of first called to avoid computation of future calls
+        self._precomputed_train_ref_log_probs = False
+        self._precomputed_eval_ref_log_probs = False
+
+        self.beta = args.beta
+        self.label_smoothing = args.label_smoothing
+        self.loss_type = (
+            args.loss_type if isinstance(args.loss_type, list) else [args.loss_type]
+        )
+        self.loss_weights = args.loss_weights
+        self.aux_loss_enabled = getattr(model.config, "output_router_logits", False)
+        self.use_weighting = args.use_weighting
+        self.aux_loss_coef = getattr(model.config, "router_aux_loss_coef", 0.0)
+        if self.aux_loss_enabled and self.aux_loss_coef == 0.0:
+            logger.warning(
+                "You set `output_router_logits` to `True` in the model config, but `router_aux_loss_coef` is set to "
+                "`0.0`, meaning the auxiliary loss will not be used. Either set `router_aux_loss_coef` to a value "
+                "greater than `0.0`, or set `output_router_logits` to `False` if you don't want to use the auxiliary "
+                "loss.",
+            )
+        for loss_type in self.loss_type:
+            if (
+                loss_type
+                in [
+                    "hinge",
+                    "ipo",
+                    "bco_pair",
+                    "sppo_hard",
+                    "nca_pair",
+                    "apo_zero",
+                    "apo_down",
+                ]
+                and args.label_smoothing > 0
+            ):
+                logger.warning(
+                    f"You are using the {loss_type} loss type that does not support label smoothing. The "
+                    "`label_smoothing` parameter will be ignored. Set `label_smoothing` to `0.0` to remove this "
+                    "warning.",
+                )
+            if loss_type == "kto_pair":
+                raise ValueError(
+                    "Support for kto_pair has been removed in DPOTrainer. Please use KTOTrainer."
+                )
+
+        self._stored_metrics = defaultdict(lambda: defaultdict(list))
+        self.f_divergence_type = args.f_divergence_type
+        self.f_divergence_params = {
+            FDivergenceConstants.ALPHA_DIVERGENCE_COEF_KEY: args.f_alpha_divergence_coef
+        }
+        self.dataset_num_proc = args.dataset_num_proc
+
+        # Dataset preparation
+        train_dataset = self._prepare_dataset(
+            train_dataset, processing_class, args, "train"
+        )
+        if eval_dataset is not None:
+            if isinstance(eval_dataset, dict):
+                eval_dataset = {
+                    key: self._prepare_dataset(dataset, processing_class, args, key)
+                    for key, dataset in eval_dataset.items()
+                }
+            else:
+                eval_dataset = self._prepare_dataset(
+                    eval_dataset, processing_class, args, "eval"
+                )
+
+        super().__init__(
+            model=model,
+            args=args,
+            data_collator=data_collator,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            processing_class=processing_class,
+            compute_metrics=compute_metrics,
+            callbacks=callbacks,
+            optimizers=optimizers,
+            optimizer_cls_and_kwargs=optimizer_cls_and_kwargs,
+            preprocess_logits_for_metrics=preprocess_logits_for_metrics,
+        )
+
+        # Gradient accumulation requires scaled loss. Normally, loss scaling in the parent class depends on whether the
+        # model accepts loss-related kwargs. Since we compute our own loss, this check is irrelevant. We set
+        # self.model_accepts_loss_kwargs to False to enable scaling.
+        self.model_accepts_loss_kwargs = False
+
+        # Add tags for models that have been loaded with the correct transformers version
+        if hasattr(self.model, "add_model_tags"):
+            self.model.add_model_tags(self._tag_names)
+
+        if not hasattr(self, "accelerator"):
+            raise AttributeError(
+                "Your `Trainer` does not have an `accelerator` object. Consider upgrading `transformers`."
+            )
+
+        # Deepspeed Zero-3 does not support precompute_ref_log_probs
+        if self.is_deepspeed_enabled:
+            if (
+                self.accelerator.state.deepspeed_plugin.zero_stage == 3
+                and self.precompute_ref_log_probs
+            ):
+                raise ValueError(
+                    "You cannot use `precompute_ref_log_probs=True` with Deepspeed ZeRO-3. Please set `precompute_ref_log_probs=False`."
+                )
+
+        if self.ref_model is None:
+            if not (self.is_peft_model or self.precompute_ref_log_probs):
+                raise ValueError(
+                    "No reference model and model is not a Peft model. Try setting `precompute_ref_log_probs=True`"
+                )
+            if args.sync_ref_model:
+                raise ValueError(
+                    "You currently cannot use `ref_model=None` with TR-DPO method. Please provide `ref_model`."
+                )
+        elif self.is_deepspeed_enabled:
+            self.ref_model = prepare_deepspeed(self.ref_model, self.accelerator)
+        elif self.is_fsdp_enabled:
+            self.ref_model = prepare_fsdp(self.ref_model, self.accelerator)
+        else:
+            self.ref_model = self.accelerator.prepare_model(
+                self.ref_model, evaluation_mode=True
+            )
+
+        if args.sync_ref_model:
+            if self.precompute_ref_log_probs:
+                raise ValueError(
+                    "You cannot use `precompute_ref_log_probs=True` with TR-DPO method. Please set `precompute_ref_log_probs=False`."
+                )
+
+            self.add_callback(
+                SyncRefModelCallback(
+                    ref_model=self.ref_model, accelerator=self.accelerator
+                )
+            )
+
+        if "bco_pair" in self.loss_type:
+            self.running = RunningMoments(self.accelerator)
+
+    @property
+    def padding_value(self):
+        warnings.warn(
+            "The `padding_value` property is deprecated and will be removed in version 0.26.0. Please use "
+            "`pad_token_id` instead.",
+        )
+        return self.pad_token_id
+
+    @padding_value.setter
+    def padding_value(self, value):
+        warnings.warn(
+            "The `padding_value` property is deprecated and will be removed in version 0.26.0. Please use "
+            "`pad_token_id` instead.",
+        )
+        self.pad_token_id = value
+
+    def _prepare_peft_model(
+        self,
+        model: PreTrainedModel,
+        ref_model: PreTrainedModel,
+        peft_config: Any,
+        args: DPOConfig,
+    ) -> PreTrainedModel:
+        """Prepares a model for PEFT training."""
+        # Initialize this variable to False. This helps tracking the case when `peft_module_casting_to_bf16`
+        # has been called in order to properly call autocast if needed.
+        self._peft_has_been_casted_to_bf16 = False
+
+        if not is_peft_available() and peft_config is not None:
+            raise ValueError(
+                "PEFT is not installed and you passed a `peft_config` in the trainer's kwargs, please install it to use the PEFT models"
+            )
+        if is_peft_available() and peft_config is not None:
+            # if model is a peft model and we have a peft_config, we merge and unload it first
+            if isinstance(model, PeftModel):
+                model = model.merge_and_unload()
+
+            if ref_model is not None and not args.force_use_ref_model:
+                raise ValueError(
+                    "You passed both a ref_model and a peft_config. For training PEFT adapters with DPO there is no need to pass a reference"
+                    " model. Please pass `ref_model=None` in case you want to train PEFT adapters, or pass a ref_model with `force_use_ref_model=True` in DPOTrainer's init."
+                    " if you want to use a different ref_model."
+                )
+
+            if getattr(model, "is_loaded_in_8bit", False) or getattr(
+                model, "is_loaded_in_4bit", False
+            ):
+                _support_gc_kwargs = hasattr(
+                    args, "gradient_checkpointing_kwargs"
+                ) and "gradient_checkpointing_kwargs" in list(
+                    inspect.signature(prepare_model_for_kbit_training).parameters
+                )
+
+                prepare_model_kwargs = {
+                    "use_gradient_checkpointing": args.gradient_checkpointing
+                }
+
+                if _support_gc_kwargs:
+                    prepare_model_kwargs["gradient_checkpointing_kwargs"] = (
+                        args.gradient_checkpointing_kwargs
+                    )
+
+                model = prepare_model_for_kbit_training(model, **prepare_model_kwargs)
+
+            else:
+                model = self._prepare_gradient_checkpointing(model, args)
+
+            # get peft model with the given config
+            model = get_peft_model(model, peft_config)
+            if args.bf16 and getattr(model, "is_loaded_in_4bit", False):
+                peft_module_casting_to_bf16(model)
+                # If args.bf16 we need to explicitly call `generate` with torch amp autocast context manager
+                self._peft_has_been_casted_to_bf16 = True
+
+        else:
+            model = self._prepare_gradient_checkpointing(model, args)
+
+        return model
+
+    def _prepare_gradient_checkpointing(self, model: PreTrainedModel, args: DPOConfig):
+        """Prepare the gradienting checkpointing for the model."""
+        # For models that use gradient_checkpointing, we need to attach a hook that enables input
+        # to explicitly have `requires_grad=True`, otherwise training will either silently
+        # fail or completely fail.
+        if args.gradient_checkpointing:
+            # For backward compatibility with older versions of transformers
+            if hasattr(model, "enable_input_require_grads"):
+                model.enable_input_require_grads()
+            else:
+
+                def make_inputs_require_grad(module, input, output):
+                    output.requires_grad_(True)
+
+                model.get_input_embeddings().register_forward_hook(
+                    make_inputs_require_grad
+                )
+
+        return model
+
+    def _prepare_dataset(
+        self,
+        dataset: Dataset | IterableDataset,
+        processing_class: PreTrainedTokenizerBase
+        | BaseImageProcessor
+        | FeatureExtractionMixin
+        | ProcessorMixin,
+        args: DPOConfig,
+        dataset_name: str,
+    ) -> Dataset | IterableDataset:
+        # Build the kwargs for the `map` function
+        map_kwargs = {}
+        if isinstance(
+            dataset, Dataset
+        ):  # IterableDataset does not support num_proc nor writer_batch_size
+            map_kwargs["num_proc"] = args.dataset_num_proc
+            map_kwargs["writer_batch_size"] = 10
+
+        with PartialState().main_process_first():
+            # Extract prompt if needed
+            if isinstance(
+                dataset, Dataset
+            ):  # `IterableDataset.map` does not support `desc`
+                map_kwargs["desc"] = f"Extracting prompt in {dataset_name} dataset"
+            dataset = dataset.map(maybe_extract_prompt, **map_kwargs)
+
+            # Apply the chat template if needed
+            if isinstance(
+                dataset, Dataset
+            ):  # `IterableDataset.map` does not support `desc`
+                map_kwargs["desc"] = f"Applying chat template to {dataset_name} dataset"
+            dataset = dataset.map(
+                maybe_apply_chat_template,
+                fn_kwargs={"tokenizer": processing_class, "tools": args.tools},
+                **map_kwargs,
+            )
+
+            # Tokenize the dataset
+            if isinstance(
+                dataset, Dataset
+            ):  # `IterableDataset.map` does not support `desc`
+                map_kwargs["desc"] = f"Tokenizing {dataset_name} dataset"
+
+            dataset = dataset.map(
+                self.tokenize_row if not self.is_vision_model else self.process_row,
+                remove_columns=["chosen", "rejected"],
+                fn_kwargs={
+                    "processing_class": processing_class,
+                    "max_prompt_length": args.max_prompt_length,
+                    "max_completion_length": args.max_completion_length,
+                    # for enc-dec, we add the special tokens ([bos_token] + prompt + [eos_token]; completion + [eos_token])
+                    "add_special_tokens": False,
+                },
+                **map_kwargs,
+            )
+
+        return dataset
+
+    @staticmethod
+    def tokenize_row(
+        features: dict[str, str],
+        processing_class: PreTrainedTokenizerBase,
+        max_prompt_length: int | None = None,
+        max_completion_length: int | None = None,
+        add_special_tokens: bool = True,
+    ) -> dict[str, list[int]]:
+        """
+        Tokenize a row of the dataset.
+
+        Args:
+            features (`dict[str, str]`):
+                Row of the dataset, should contain the keys `"prompt"`, `"chosen"`, and `"rejected"`.
+            processing_class ([`~transformers.PreTrainedTokenizerBase`]):
+                Processing class used to process the data.
+            max_prompt_length (`int` or `None`):
+                Maximum length of the prompt sequence. If `None`, the prompt sequence is not truncated.
+            max_completion_length (`int` or `None`):
+                Maximum length of the completion sequences. If `None`, the completion sequences are not truncated.
+            add_special_tokens (`bool`):
+                Whether to add special tokens to the sequences. Typically used for encoder-decoder models. If `True`,
+                the prompt sequence will have a bos token prepended and an eos token appended. In any case, the
+                completion sequences will have an eos token appended.
+
+        Returns
+        -------
+            `dict[str, list[int]]`:
+                Tokenized sequences with the keys `"prompt_input_ids"`, `"chosen_input_ids"`, and
+                `"rejected_input_ids".
+
+        Example:
+        ```python
+        >>> from transformers import GPT2Tokenizer
+
+        >>> tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+        >>> features = {"prompt": "The sky is", "chosen": " blue", "rejected": " green"}
+        >>> DPOTrainer.tokenize_row(
+        ...     features, tokenizer, max_prompt_length=3, max_completion_length=3, add_special_tokens=False
+        ... )
+        {'prompt_input_ids': [464, 6766, 318], 'chosen_input_ids': [4171, 50256], 'rejected_input_ids': [4077, 50256]}
+        ```
+        """
+        tokenizer = processing_class  # the processing class is a tokenizer
+        prompt_input_ids = tokenizer(features["prompt"], add_special_tokens=False)[
+            "input_ids"
+        ]
+        chosen_input_ids = tokenizer(features["chosen"], add_special_tokens=False)[
+            "input_ids"
+        ]
+        rejected_input_ids = tokenizer(features["rejected"], add_special_tokens=False)[
+            "input_ids"
+        ]
+
+        # Add special tokens (typically for encoder-decoder models)
+        if add_special_tokens:
+            if tokenizer.bos_token_id is not None:
+                prompt_input_ids = [tokenizer.bos_token_id] + prompt_input_ids
+            if tokenizer.eos_token_id is not None:
+                prompt_input_ids = prompt_input_ids + [tokenizer.eos_token_id]
+        chosen_input_ids = chosen_input_ids + [tokenizer.eos_token_id]
+        rejected_input_ids = rejected_input_ids + [tokenizer.eos_token_id]
+
+        # Truncate prompt and completion sequences
+        if max_prompt_length is not None:
+            prompt_input_ids = prompt_input_ids[-max_prompt_length:]
+        if max_completion_length is not None:
+            chosen_input_ids = chosen_input_ids[:max_completion_length]
+            rejected_input_ids = rejected_input_ids[:max_completion_length]
+
+        return {
+            "prompt_input_ids": prompt_input_ids,
+            "chosen_input_ids": chosen_input_ids,
+            "rejected_input_ids": rejected_input_ids,
+        }
+
+    @staticmethod
+    def process_row(
+        features: dict[str, str],
+        processing_class: PreTrainedTokenizerBase,
+        max_prompt_length: int | None = None,
+        max_completion_length: int | None = None,
+        add_special_tokens: bool = True,
+    ) -> dict[str, list[int]]:
+        """
+        Same as `tokenize_row` but for vision models. Please refer to `tokenize_row` for more information.
+        """
+        processor, tokenizer = (
+            processing_class,
+            processing_class.tokenizer,
+        )  # the processing class is a processor
+        processed_features = processor(
+            images=features["images"], text=features["prompt"], add_special_tokens=False
+        )
+
+        prompt_input_ids = processed_features["input_ids"][0]
+        pixel_values = processed_features["pixel_values"][0]
+        chosen_input_ids = tokenizer(features["chosen"], add_special_tokens=False)[
+            "input_ids"
+        ]
+        rejected_input_ids = tokenizer(features["rejected"], add_special_tokens=False)[
+            "input_ids"
+        ]
+
+        # Add special tokens (typically for encoder-decoder models)
+        if add_special_tokens:
+            if tokenizer.bos_token_id is not None:
+                prompt_input_ids = [tokenizer.bos_token_id] + prompt_input_ids
+            if tokenizer.eos_token_id is not None:
+                prompt_input_ids = prompt_input_ids + [tokenizer.eos_token_id]
+        chosen_input_ids = chosen_input_ids + [tokenizer.eos_token_id]
+        rejected_input_ids = rejected_input_ids + [tokenizer.eos_token_id]
+
+        # Truncate prompt and completion sequences
+        if max_prompt_length is not None:
+            prompt_input_ids = prompt_input_ids[-max_prompt_length:]
+        if max_completion_length is not None:
+            chosen_input_ids = chosen_input_ids[:max_completion_length]
+            rejected_input_ids = rejected_input_ids[:max_completion_length]
+
+        output = {
+            "prompt_input_ids": prompt_input_ids,
+            "pixel_values": pixel_values,
+            "chosen_input_ids": chosen_input_ids,
+            "rejected_input_ids": rejected_input_ids,
+        }
+
+        if "pixel_attention_mask" in processed_features:
+            output["pixel_attention_mask"] = processed_features["pixel_attention_mask"][
+                0
+            ]
+        if "image_sizes" in processed_features:
+            output["image_sizes"] = processed_features["image_sizes"][0]
+        if "token_type_ids" in processed_features:
+            output["token_type_ids"] = processed_features["token_type_ids"][0]
+
+        return output
+
+    def _set_signature_columns_if_needed(self):
+        # If `self.args.remove_unused_columns` is True, non-signature columns are removed.
+        # By default, this method sets `self._signature_columns` to the model's expected inputs.
+        # In DPOTrainer, we preprocess data, so using the model's signature columns doesn't work.
+        # Instead, we set them to the columns expected by `DataCollatorForPreference`, hence the override.
+        if self._signature_columns is None:
+            self._signature_columns = [
+                "prompt_input_ids",
+                "chosen_input_ids",
+                "rejected_input_ids",
+                "image_sizes",
+                "token_type_ids",
+                "ref_chosen_logps",
+                "ref_rejected_logps",
+            ]
+
+    def get_train_dataloader(self) -> DataLoader:
+        """
+        Returns the training [`~torch.utils.data.DataLoader`].
+
+        Subclass of transformers.src.transformers.trainer.get_train_dataloader to precompute `ref_log_probs`.
+        """
+        if self.precompute_ref_log_probs and not self._precomputed_train_ref_log_probs:
+            batch_size = (
+                self.args.precompute_ref_batch_size
+                or self.args.per_device_train_batch_size
+            )
+            dataloader_params = {
+                "batch_size": batch_size,
+                "collate_fn": self.data_collator,
+                "num_workers": self.args.dataloader_num_workers,
+                "pin_memory": self.args.dataloader_pin_memory,
+                "shuffle": False,
+            }
+
+            # prepare dataloader
+            data_loader = self.accelerator.prepare(
+                DataLoader(self.train_dataset, **dataloader_params)
+            )
+
+            ref_chosen_logps = []
+            ref_rejected_logps = []
+            for padded_batch in tqdm(
+                iterable=data_loader, desc="Train dataset reference log probs"
+            ):
+                ref_chosen_logp, ref_rejected_logp = self.compute_ref_log_probs(
+                    padded_batch
+                )
+                ref_chosen_logp, ref_rejected_logp = (
+                    self.accelerator.gather_for_metrics(
+                        (ref_chosen_logp, ref_rejected_logp)
+                    )
+                )
+                ref_chosen_logps.append(ref_chosen_logp.cpu())
+                ref_rejected_logps.append(ref_rejected_logp.cpu())
+
+                # Unnecessary cache clearing to avoid OOM
+                empty_cache()
+                self.accelerator.free_memory()
+
+            all_ref_chosen_logps = torch.cat(ref_chosen_logps).float().numpy()
+            all_ref_rejected_logps = torch.cat(ref_rejected_logps).float().numpy()
+
+            self.train_dataset = self.train_dataset.add_column(
+                name="ref_chosen_logps", column=all_ref_chosen_logps
+            )
+            self.train_dataset = self.train_dataset.add_column(
+                name="ref_rejected_logps", column=all_ref_rejected_logps
+            )
+
+            self._precomputed_train_ref_log_probs = True
+
+        return super().get_train_dataloader()
+
+    def get_eval_dataloader(self, eval_dataset: Dataset | None = None) -> DataLoader:
+        """
+        Returns the evaluation [`~torch.utils.data.DataLoader`].
+
+        Subclass of transformers.src.transformers.trainer.get_eval_dataloader to precompute `ref_log_probs`.
+
+        Args:
+            eval_dataset (`torch.utils.data.Dataset`, *optional*):
+                If provided, will override `self.eval_dataset`. If it is a [`~datasets.Dataset`], columns not accepted
+                by the `model.forward()` method are automatically removed. It must implement `__len__`.
+        """
+        if eval_dataset is None and self.eval_dataset is None:
+            raise ValueError("Trainer: evaluation requires an eval_dataset.")
+        eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset
+
+        if self.precompute_ref_log_probs and not self._precomputed_eval_ref_log_probs:
+            batch_size = (
+                self.args.precompute_ref_batch_size
+                or self.args.per_device_eval_batch_size
+            )
+            dataloader_params = {
+                "batch_size": batch_size,
+                "collate_fn": self.data_collator,
+                "num_workers": self.args.dataloader_num_workers,
+                "pin_memory": self.args.dataloader_pin_memory,
+                "shuffle": False,
+            }
+
+            # prepare dataloader
+            data_loader = self.accelerator.prepare(
+                DataLoader(eval_dataset, **dataloader_params)
+            )
+
+            ref_chosen_logps = []
+            ref_rejected_logps = []
+            for padded_batch in tqdm(
+                iterable=data_loader, desc="Eval dataset reference log probs"
+            ):
+                ref_chosen_logp, ref_rejected_logp = self.compute_ref_log_probs(
+                    padded_batch
+                )
+                ref_chosen_logp, ref_rejected_logp = (
+                    self.accelerator.gather_for_metrics(
+                        (ref_chosen_logp, ref_rejected_logp)
+                    )
+                )
+                ref_chosen_logps.append(ref_chosen_logp.cpu())
+                ref_rejected_logps.append(ref_rejected_logp.cpu())
+
+            all_ref_chosen_logps = torch.cat(ref_chosen_logps).float().numpy()
+            all_ref_rejected_logps = torch.cat(ref_rejected_logps).float().numpy()
+
+            eval_dataset = eval_dataset.add_column(
+                name="ref_chosen_logps", column=all_ref_chosen_logps
+            )
+            eval_dataset = eval_dataset.add_column(
+                name="ref_rejected_logps", column=all_ref_rejected_logps
+            )
+
+            # Save calculated ref_chosen_logps and ref_rejected_logps to the eval_dataset for subsequent runs
+            if self.eval_dataset is not None:
+                self.eval_dataset = eval_dataset
+            self._precomputed_eval_ref_log_probs = True
+
+        return super().get_eval_dataloader(eval_dataset=eval_dataset)
+
+    @contextmanager
+    def null_ref_context(self):
+        """Context manager for handling null reference model (that is, peft adapter manipulation)."""
+        with (
+            self.accelerator.unwrap_model(self.model).disable_adapter()
+            if self.is_peft_model and not self.ref_adapter_name
+            else nullcontext()
+        ):
+            if self.ref_adapter_name:
+                self.model.set_adapter(self.ref_adapter_name)
+            yield
+            if self.ref_adapter_name:
+                self.model.set_adapter(self.model_adapter_name or "default")
+
+    def compute_ref_log_probs(
+        self, batch: dict[str, torch.LongTensor]
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Computes log probabilities of the reference model for a single padded batch of a DPO specific dataset."""
+        compte_ref_context_manager = (
+            autocast(self.accelerator.device.type)
+            if self._peft_has_been_casted_to_bf16
+            else nullcontext()
+        )
+        with torch.no_grad(), compte_ref_context_manager:
+            if self.ref_model is None:
+                with self.null_ref_context():
+                    ref_model_output = self.concatenated_forward(
+                        self.model, batch, is_ref_model=True
+                    )
+            else:
+                ref_model_output = self.concatenated_forward(
+                    self.ref_model, batch, is_ref_model=True
+                )
+        return ref_model_output["chosen_logps"], ref_model_output["rejected_logps"]
+
+    @staticmethod
+    def concatenated_inputs(
+        batch: dict[str, list | torch.LongTensor], padding_value: int
+    ) -> dict[str, torch.LongTensor]:
+        """
+        Concatenate the `chosen` and `rejected` inputs from the batch into a single tensor for both the prompt and
+        completion sequences.
+
+        Args:
+            batch (`dict[str, list | torch.LongTensor]`):
+                A batch of input data. The batch must contain the following keys:
+
+                - `"prompt_input_ids"`: Tensor of shape `(batch_size, prompt_length)` representing the prompt input
+                  IDs.
+                - `"chosen_input_ids"`: Tensor of shape `(batch_size, chosen_length)` representing the chosen
+                  completion input IDs.
+                - `"rejected_input_ids"`: Tensor of shape `(batch_size, rejected_length)` representing the rejected
+                  completion input IDs.
+                - `"prompt_pixel_values"` (optional): Tensor for pixel values, if available.
+                - `"prompt_pixel_attention_mask"` (optional): Tensor for pixel attention masks, if available.
+
+            padding_value (`int`):
+                The padding value to use for the concatenated completion sequences (`chosen_input_ids` and
+                `rejected_input_ids`).
+
+        Returns
+        -------
+            `dict[str, torch.LongTensor]`: A dictionary containing:
+
+                - `"prompt_input_ids"`: Concatenated prompt input IDs of shape `(2 * batch_size, prompt_length)`.
+                - `"completion_input_ids"`: Concatenated chosen and rejected completion input IDs of shape `(2 *
+                  batch_size, max_completion_length)`.
+                - `"prompt_attention_mask"`: Concatenated prompt attention masks of shape `(2 * batch_size,
+                  prompt_length)`.
+                - `"completion_attention_mask"`: Concatenated chosen and rejected attention masks of shape `(2 *
+                  batch_size, max_completion_length)`.
+                - `"pixel_values"` (optional): Concatenated pixel values if `"prompt_pixel_values"` are present.
+                - `"pixel_attention_mask"` (optional): Concatenated pixel attention masks if
+                  `"prompt_pixel_attention_mask"` are present.
+
+        Notes
+        -----
+            The completion input IDs and attention masks are padded to the maximum completion length of the chosen or
+            rejected sequences.
+        """
+        output = {}
+
+        # For the prompt, the input_ids are the same for both the chosen and rejected responses
+        output["prompt_input_ids"] = torch.cat(
+            [batch["prompt_input_ids"], batch["prompt_input_ids"]], dim=0
+        )
+        output["prompt_attention_mask"] = torch.cat(
+            [batch["prompt_attention_mask"], batch["prompt_attention_mask"]], dim=0
+        )
+        if "pixel_values" in batch:
+            output["pixel_values"] = torch.cat(
+                [batch["pixel_values"], batch["pixel_values"]], dim=0
+            )
+
+        if "pixel_attention_mask" in batch:
+            output["pixel_attention_mask"] = torch.cat(
+                [batch["pixel_attention_mask"], batch["pixel_attention_mask"]], dim=0
+            )
+        if "image_sizes" in batch:
+            output["image_sizes"] = torch.cat(
+                [batch["image_sizes"], batch["image_sizes"]], dim=0
+            )
+        if "token_type_ids" in batch:
+            output["token_type_ids"] = torch.cat(
+                (batch["token_type_ids"], batch["token_type_ids"])
+            )
+
+        # Concatenate the chosen and rejected completions
+        max_completion_length = max(
+            batch["chosen_input_ids"].shape[1], batch["rejected_input_ids"].shape[1]
+        )
+        output["completion_input_ids"] = torch.cat(
+            (
+                pad_to_length(
+                    batch["chosen_input_ids"],
+                    max_completion_length,
+                    pad_value=padding_value,
+                ),
+                pad_to_length(
+                    batch["rejected_input_ids"],
+                    max_completion_length,
+                    pad_value=padding_value,
+                ),
+            ),
+        )
+        output["completion_attention_mask"] = torch.cat(
+            (
+                pad_to_length(
+                    batch["chosen_attention_mask"], max_completion_length, pad_value=0
+                ),
+                pad_to_length(
+                    batch["rejected_attention_mask"], max_completion_length, pad_value=0
+                ),
+            ),
+        )
+
+        return output
+
+    def dpo_loss(
+        self,
+        chosen_logps: torch.FloatTensor,
+        rejected_logps: torch.FloatTensor,
+        ref_chosen_logps: torch.FloatTensor,
+        ref_rejected_logps: torch.FloatTensor,
+        loss_type: str = "sigmoid",
+        model_output: dict[str, torch.FloatTensor] = None,
+    ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
+        """
+        Compute the DPO loss for a batch of policy and reference model log probabilities.
+
+        Args:
+            chosen_logps (`torch.FloatTensor`):
+                Log probabilities of the model for the chosen responses. Shape: `(batch_size,)`.
+            rejected_logps (`torch.FloatTensor`):
+                Log probabilities of the model for the rejected responses. Shape: `(batch_size,)`.
+            ref_chosen_logps (`torch.FloatTensor`):
+                Log probabilities of the reference model for the chosen responses. Shape: `(batch_size,)`.
+            ref_rejected_logps (`torch.FloatTensor`):
+                Log probabilities of the reference model for the rejected responses. Shape: `(batch_size,)`.
+            loss_type (`str`, defaults to `"sigmoid"`):
+                The type of loss to compute. One of:
+                - `"sigmoid"`: Sigmoid loss from the original [DPO](https://huggingface.co/papers/2305.18290) paper.
+                - `"hinge"`: Hinge loss on the normalized likelihood from the
+                  [SLiC](https://huggingface.co/papers/2305.10425) paper.
+                - `"ipo"`: IPO loss from the [IPO](https://huggingface.co/papers/2310.12036) paper.
+                - `"exo_pair"`: Pairwise EXO loss from the [EXO](https://huggingface.co/papers/2402.00856) paper.
+                - `"nca_pair"`: Pairwise NCA loss from the [NCA](https://huggingface.co/papers/2402.05369) paper.
+                - `"robust"`: Unbiased estimate of the DPO loss that is robust to preference noise from the [Robust
+                  DPO](https://huggingface.co/papers/2403.00409) paper.
+                - `"bco_pair"`: Pairwise BCO loss from the [BCO](https://huggingface.co/papers/2404.04656) paper.
+                - `"sppo_hard"`: SPPO loss with hard label from the [SPPO](https://huggingface.co/papers/2405.00675)
+                  paper.
+                - `"aot"`: AOT loss for paired datasets from the [AOT](https://huggingface.co/papers/2406.05882) paper.
+                - `"aot_pair"`: AOT loss for unpaired datasets from the [AOT](https://huggingface.co/papers/2406.05882)
+                  paper.
+                - `"discopop"`: DiscoPOP (a.k.a Log-Ratio Modulated Loss, LRML) loss from the
+                  [DiscoPOP](https://huggingface.co/papers/2406.08414) paper.
+                - `"apo_zero"`: APO-zero loss from the [APO](https://huggingface.co/papers/2408.06266) paper.
+                - `"apo_down"`: APO-down loss from the [APO](https://huggingface.co/papers/2408.06266) paper.
+                - `"sft"`: Negative log-likelihood loss (standard supervised fine-tuning loss).
+            model_output (`dict[str, torch.FloatTensor]`, *optional*):
+                The output of the model's forward pass. This is used to compute auxiliary losses if enabled.
+
+        Returns
+        -------
+            A tuple of three tensors: `(losses, chosen_rewards, rejected_rewards)`. The losses tensor contains the DPO
+            loss for each example in the batch. The `chosen_rewards` and `rejected_rewards` tensors contain the rewards
+            for the chosen and rejected responses, respectively.
+        """
+        device = self.accelerator.device
+
+        # Get the log ratios for the chosen and rejected responses
+        chosen_logratios = chosen_logps.to(device) - (
+            not self.reference_free
+        ) * ref_chosen_logps.to(device)
+        rejected_logratios = rejected_logps.to(device) - (
+            not self.reference_free
+        ) * ref_rejected_logps.to(device)
+
+        if self.f_divergence_type == FDivergenceType.ALPHA_DIVERGENCE:
+            # The alpha-divergence formula: (1 - u^-alpha) / alpha
+            # The divergence difference between the chosen and rejected sample is:
+            #     (1 - u[w]^-alpha) / alpha - (1 - u[l]^-alpha) / alpha
+            #        = (u[l]^-alpha - u[w]^-alpha) / alpha
+            # where u[w] and u[l] are the policy/reference probability ratios
+            # for the chosen and rejected samples, respectively.
+            alpha_coef = FDivergenceConstants.ALPHA_DIVERGENCE_COEF_DEFAULT
+            if (
+                self.f_divergence_params
+                and FDivergenceConstants.ALPHA_DIVERGENCE_COEF_KEY
+                in self.f_divergence_params
+            ):
+                alpha_coef = float(
+                    self.f_divergence_params[
+                        FDivergenceConstants.ALPHA_DIVERGENCE_COEF_KEY
+                    ]
+                )
+            logits = (
+                cap_exp(rejected_logratios * -alpha_coef)
+                - cap_exp(chosen_logratios * -alpha_coef)
+            ) / alpha_coef
+        else:
+            logratios = chosen_logps - rejected_logps
+            if self.reference_free:
+                ref_logratios = torch.tensor(
+                    [0], dtype=logratios.dtype, device=logratios.device
+                )
+            else:
+                ref_logratios = ref_chosen_logps - ref_rejected_logps
+
+            logratios = logratios.to(self.accelerator.device)
+            ref_logratios = ref_logratios.to(self.accelerator.device)
+            logits = logratios - ref_logratios
+
+            if self.f_divergence_type == FDivergenceType.JS_DIVERGENCE:
+                # The js-divergence formula: log(2 * u / (1 + u))
+                # The divergence difference between the chosen and rejected sample is:
+                #     log(2 * u[w] / (1 + u[w])) - log(2 * u[l] / (1 + u[l]))
+                #       = log(u[w]) - log(u[l]) - (log(1 + u[w]) - log(1 + u[l]))
+                # where u[w] and u[l] are the policy/reference probability ratios
+                # for the chosen and rejected samples, respectively.
+                logits -= F.softplus(chosen_logratios) - F.softplus(rejected_logratios)
+
+        # The beta is a temperature parameter for the DPO loss, typically something in the range of 0.1 to 0.5.
+        # We ignore the reference model as beta -> 0. The label_smoothing parameter encodes our uncertainty about the
+        # labels and calculates a conservative DPO loss.
+        if loss_type == "sigmoid":
+            losses = (
+                -F.logsigmoid(self.beta * logits) * (1 - self.label_smoothing)
+                - F.logsigmoid(-self.beta * logits) * self.label_smoothing
+            )
+
+        elif loss_type == "robust":
+            losses = (
+                -F.logsigmoid(self.beta * logits) * (1 - self.label_smoothing)
+                + F.logsigmoid(-self.beta * logits) * self.label_smoothing
+            ) / (1 - 2 * self.label_smoothing)
+
+        elif loss_type == "exo_pair":
+            # eqn (16) of the EXO paper: https://huggingface.co/papers/2402.00856
+            import math
+
+            if self.label_smoothing == 0:
+                self.label_smoothing = 1e-3
+            losses = (self.beta * logits).sigmoid() * (
+                F.logsigmoid(self.beta * logits) - math.log(1 - self.label_smoothing)
+            ) + (-self.beta * logits).sigmoid() * (
+                F.logsigmoid(-self.beta * logits) - math.log(self.label_smoothing)
+            )
+
+        elif loss_type == "hinge":
+            losses = torch.relu(1 - self.beta * logits)
+
+        elif loss_type == "ipo":
+            # eqn (17) of the paper where beta is the regularization parameter for the IPO loss, denoted by tau in the paper.
+            losses = (logits - 1 / (2 * self.beta)) ** 2
+
+        elif loss_type == "bco_pair":
+            chosen_logratios = chosen_logps - ref_chosen_logps
+            rejected_logratios = rejected_logps - ref_rejected_logps
+            chosen_rewards = self.beta * chosen_logratios
+            rejected_rewards = self.beta * rejected_logratios
+            rewards = torch.cat((chosen_rewards, rejected_rewards), 0).mean().detach()
+            self.running.update(rewards)
+            delta = self.running.mean
+            losses = -F.logsigmoid(
+                (self.beta * chosen_logratios) - delta
+            ) - F.logsigmoid(-(self.beta * rejected_logratios - delta))
+
+        elif loss_type == "sppo_hard":
+            # In the paper (https://huggingface.co/papers/2405.00675), SPPO employs a soft probability approach,
+            # estimated using the PairRM score. The probability calculation is conducted outside of the trainer class.
+            # The version described here is the hard probability version, where P in Equation (4.7) of Algorithm 1 is
+            # set to 1 for the winner and 0 for the loser.
+            a = chosen_logps - ref_chosen_logps
+            b = rejected_logps - ref_rejected_logps
+            losses = (a - 0.5 / self.beta) ** 2 + (b + 0.5 / self.beta) ** 2
+
+        elif loss_type == "nca_pair":
+            chosen_rewards = (chosen_logps - ref_chosen_logps) * self.beta
+            rejected_rewards = (rejected_logps - ref_rejected_logps) * self.beta
+            losses = (
+                -F.logsigmoid(chosen_rewards)
+                - 0.5 * F.logsigmoid(-chosen_rewards)
+                - 0.5 * F.logsigmoid(-rejected_rewards)
+            )
+
+        elif loss_type == "aot_pair":
+            chosen_logratios = chosen_logps - ref_chosen_logps
+            rejected_logratios = rejected_logps - ref_rejected_logps
+            chosen_logratios_sorted, _ = torch.sort(chosen_logratios, dim=0)
+            rejected_logratios_sorted, _ = torch.sort(rejected_logratios, dim=0)
+            delta = chosen_logratios_sorted - rejected_logratios_sorted
+            losses = (
+                -F.logsigmoid(self.beta * delta) * (1 - self.label_smoothing)
+                - F.logsigmoid(-self.beta * delta) * self.label_smoothing
+            )
+
+        elif loss_type == "aot":
+            logratios = chosen_logps - rejected_logps
+            ref_logratios = ref_chosen_logps - ref_rejected_logps
+            logratios_sorted, _ = torch.sort(logratios, dim=0)
+            ref_logratios_sorted, _ = torch.sort(ref_logratios, dim=0)
+            delta = logratios_sorted - ref_logratios_sorted
+            losses = (
+                -F.logsigmoid(self.beta * delta) * (1 - self.label_smoothing)
+                - F.logsigmoid(-self.beta * delta) * self.label_smoothing
+            )
+
+        elif loss_type == "apo_zero":
+            # Eqn (7) of the APO paper (https://huggingface.co/papers/2408.06266)
+            # Use this loss when you believe the chosen outputs are better than your model's default output
+            losses_chosen = 1 - F.sigmoid(
+                self.beta * chosen_logratios
+            )  # Increase chosen likelihood
+            losses_rejected = F.sigmoid(
+                self.beta * rejected_logratios
+            )  # Decrease rejected likelihood
+            losses = losses_chosen + losses_rejected
+
+        elif loss_type == "apo_down":
+            # Eqn (8) of the APO paper (https://huggingface.co/papers/2408.06266)
+            # Use this loss when you believe the chosen outputs are worse than your model's default output.
+            # Decrease chosen likelihood and decrease rejected likelihood more
+            losses_chosen = F.sigmoid(self.beta * chosen_logratios)
+            losses_rejected = 1 - F.sigmoid(
+                self.beta * (chosen_logratios - rejected_logratios)
+            )
+            losses = losses_chosen + losses_rejected
+
+        elif loss_type == "discopop":
+            # Eqn (5) of the DiscoPOP paper (https://huggingface.co/papers/2406.08414)
+            # This loss was discovered with LLM discovery
+            logratios = chosen_logps - rejected_logps
+            ref_logratios = ref_chosen_logps - ref_rejected_logps
+            logits = logratios - ref_logratios
+            logits = logits * self.beta
+            # Modulate the mixing coefficient based on the log ratio magnitudes
+            log_ratio_modulation = torch.sigmoid(logits / self.args.discopop_tau)
+            logistic_component = -F.logsigmoid(logits)
+            exp_component = torch.exp(-logits)
+            # Blend between logistic and exponential component based on log ratio modulation
+            losses = (
+                logistic_component * (1 - log_ratio_modulation)
+                + exp_component * log_ratio_modulation
+            )
+
+        elif loss_type == "sft":
+            # SFT loss is the negative log likelihood loss on chosen responses
+            # This acts as the generation loss component in MPO
+            sft_loss = model_output["nll_loss"]
+            # Create losses tensor with same shape as other losses (per-sample)
+            batch_size = chosen_logps.shape[0]
+            losses = sft_loss.expand(batch_size)
+            # For SFT, we don't have preference rewards, so use zeros
+            chosen_rewards = torch.zeros_like(chosen_logps)
+            rejected_rewards = torch.zeros_like(rejected_logps)
+
+        else:
+            raise ValueError(
+                f"Unknown loss type: {self.loss_type}. Should be one of ['sigmoid', 'hinge', 'ipo', 'exo_pair', "
+                "'nca_pair', 'robust', 'bco_pair', 'sppo_hard', 'aot', 'aot_pair', 'discopop', 'apo_zero', "
+                "'apo_down', 'sft']"
+            )
+
+        chosen_rewards = (
+            self.beta * (chosen_logps.to(device) - ref_chosen_logps.to(device)).detach()
+        )
+        rejected_rewards = (
+            self.beta
+            * (rejected_logps.to(device) - ref_rejected_logps.to(device)).detach()
+        )
+
+        return losses, chosen_rewards, rejected_rewards
+
+    def _compute_loss_liger(
+        self, model: nn.Module, batch: dict[str, list | torch.LongTensor]
+    ) -> dict[str, torch.Tensor]:
+        unwrapped_model = self.accelerator.unwrap_model(model)
+        concatenated_batch = self.concatenated_inputs(
+            batch, padding_value=self.pad_token_id
+        )
+
+        model_kwargs = {}
+        if self.aux_loss_enabled:
+            model_kwargs["output_router_logits"] = True
+
+        # Add the pixel values and attention masks for vision models
+        if "pixel_values" in concatenated_batch:
+            model_kwargs["pixel_values"] = concatenated_batch["pixel_values"]
+        if "pixel_attention_mask" in concatenated_batch:
+            model_kwargs["pixel_attention_mask"] = concatenated_batch[
+                "pixel_attention_mask"
+            ]
+        if "image_sizes" in concatenated_batch:
+            model_kwargs["image_sizes"] = concatenated_batch["image_sizes"]
+
+        prompt_attention_mask = concatenated_batch["prompt_attention_mask"]
+        completion_attention_mask = concatenated_batch["completion_attention_mask"]
+
+        if self.is_encoder_decoder:
+            # 1. Get encoder outputs
+            encoder_outputs = unwrapped_model.get_encoder()(
+                concatenated_batch["prompt_input_ids"],
+                attention_mask=concatenated_batch["prompt_attention_mask"],
+                return_dict=True,
+            )
+            # 2. Prepare decoder inputs
+            decoder_input_ids = shift_tokens_right(
+                concatenated_batch["completion_input_ids"],
+                unwrapped_model.config.decoder_start_token_id,
+            )
+            # 3. Get decoder outputs
+            decoder_outputs = unwrapped_model.get_decoder()(
+                input_ids=decoder_input_ids,
+                attention_mask=concatenated_batch["completion_attention_mask"],
+                encoder_hidden_states=encoder_outputs.last_hidden_state,
+                encoder_attention_mask=concatenated_batch["prompt_attention_mask"],
+                use_cache=False,
+            )
+            hidden_states = decoder_outputs.last_hidden_state
+
+            ref_hidden_states = None
+            if not self.reference_free and self.ref_model is not None:
+                unwrapped_ref_model = self.accelerator.unwrap_model(self.ref_model)
+                ref_encoder_outputs = unwrapped_ref_model.get_encoder()(
+                    concatenated_batch["prompt_input_ids"],
+                    attention_mask=concatenated_batch["prompt_attention_mask"],
+                    return_dict=True,
+                )
+                ref_decoder_outputs = unwrapped_ref_model.get_decoder()(
+                    input_ids=decoder_input_ids,
+                    attention_mask=concatenated_batch["completion_attention_mask"],
+                    encoder_hidden_states=ref_encoder_outputs.last_hidden_state,
+                    encoder_attention_mask=concatenated_batch["prompt_attention_mask"],
+                    use_cache=False,
+                )
+                ref_hidden_states = ref_decoder_outputs.last_hidden_state
+            elif not self.reference_free:
+                with self.null_ref_context():
+                    ref_encoder_outputs = unwrapped_model.get_encoder()(
+                        concatenated_batch["prompt_input_ids"],
+                        attention_mask=concatenated_batch["prompt_attention_mask"],
+                        return_dict=True,
+                    )
+                    ref_decoder_outputs = unwrapped_model.get_decoder()(
+                        input_ids=decoder_input_ids,
+                        attention_mask=concatenated_batch["completion_attention_mask"],
+                        encoder_hidden_states=ref_encoder_outputs.last_hidden_state,
+                        encoder_attention_mask=concatenated_batch[
+                            "prompt_attention_mask"
+                        ],
+                        use_cache=False,
+                    )
+                    ref_hidden_states = ref_decoder_outputs.last_hidden_state
+
+            labels = concatenated_batch["completion_input_ids"]
+            loss_mask = completion_attention_mask.bool()
+        else:
+            # For decoder-only models
+            input_ids = torch.cat(
+                (
+                    concatenated_batch["prompt_input_ids"],
+                    concatenated_batch["completion_input_ids"],
+                ),
+                dim=1,
+            )
+            attention_mask = torch.cat(
+                (
+                    concatenated_batch["prompt_attention_mask"],
+                    concatenated_batch["completion_attention_mask"],
+                ),
+                dim=1,
+            )
+            # Mask the prompt but not the completion for the loss
+            loss_mask = torch.cat(
+                (torch.zeros_like(prompt_attention_mask), completion_attention_mask),
+                dim=1,
+            )
+
+            # Flush and truncate
+            if self.max_length is not None and self.max_length < attention_mask.size(1):
+                if self.truncation_mode == "keep_start":
+                    # Flush left to reduce the memory usage
+                    # [[0, 0, x, x, x, x],  ->  [[x, x, x, x],
+                    #  [0, x, x, x, 0, 0]]       [x, x, x, 0]]
+                    attention_mask, input_ids, loss_mask = flush_left(
+                        attention_mask, input_ids, loss_mask
+                    )
+                    attention_mask = attention_mask[:, : self.max_length]
+                    input_ids = input_ids[:, : self.max_length]
+                    loss_mask = loss_mask[:, : self.max_length]
+                elif self.truncation_mode == "keep_end":
+                    # Flush right before truncating left, then flush left
+                    # [[0, 0, x, x, x, x],  ->  [[0, 0, x, x],
+                    #  [0, x, x, x, 0, 0]]       [0, x, x, x]]
+                    attention_mask, input_ids, loss_mask = flush_right(
+                        attention_mask, input_ids, loss_mask
+                    )
+                    input_ids = input_ids[:, -self.max_length :]
+                    attention_mask = attention_mask[:, -self.max_length :]
+                    loss_mask = loss_mask[:, -self.max_length :]
+                    attention_mask, input_ids, loss_mask = flush_left(
+                        attention_mask, input_ids, loss_mask
+                    )
+                else:
+                    raise ValueError(
+                        f"Unknown truncation mode: '{self.truncation_mode}'. Should be one of ['keep_end', "
+                        "'keep_start']."
+                    )
+            else:
+                # Flush left to reduce the memory usage
+                # [[0, 0, x, x, x, x],  ->  [[x, x, x, x],
+                #  [0, x, x, x, 0, 0]]       [x, x, x, 0]]
+                attention_mask, input_ids, loss_mask = flush_left(
+                    attention_mask, input_ids, loss_mask
+                )
+
+            # Add logits_to_keep optimization
+            if self.use_logits_to_keep:
+                first_compute_index = loss_mask.nonzero(as_tuple=True)[1].min()
+                logits_to_keep = (loss_mask.shape[1] - first_compute_index).item() + 1
+                model_kwargs["logits_to_keep"] = logits_to_keep
+
+            model_kwargs["output_hidden_states"] = True
+
+            # Add padding-free training support
+            if self.padding_free:
+                input_ids = input_ids[attention_mask.bool()].unsqueeze(0)
+                loss_mask = loss_mask[attention_mask.bool()].unsqueeze(0)
+                position_ids = (
+                    attention_mask.cumsum(1)[attention_mask.bool()].unsqueeze(0) - 1
+                )
+                model_kwargs["position_ids"] = position_ids
+            else:
+                model_kwargs["attention_mask"] = attention_mask
+
+            # Get the base model outputs (before LM head)
+            if (
+                hasattr(unwrapped_model, "get_decoder")
+                and unwrapped_model.get_decoder() is not None
+            ):
+                base_model = unwrapped_model.get_decoder()
+            else:
+                base_attr = getattr(
+                    unwrapped_model,
+                    "base_model_prefix",
+                    self.args.base_model_attribute_name,
+                )
+                base_model = getattr(unwrapped_model, base_attr, unwrapped_model)
+
+            outputs = base_model(
+                input_ids,
+                use_cache=False,
+                **model_kwargs,
+            )
+            hidden_states = outputs.last_hidden_state[:, :-1]
+
+            # Get reference hidden states if needed
+            ref_hidden_states = None
+            if not self.reference_free and self.ref_model is not None:
+                unwrapped_ref_model = self.accelerator.unwrap_model(self.ref_model)
+                if (
+                    hasattr(unwrapped_ref_model, "get_decoder")
+                    and unwrapped_ref_model.get_decoder() is not None
+                ):
+                    ref_base_model = unwrapped_ref_model.get_decoder()
+                else:
+                    ref_attr = getattr(
+                        unwrapped_ref_model,
+                        "base_model_prefix",
+                        self.args.base_model_attribute_name,
+                    )
+                    ref_base_model = getattr(
+                        unwrapped_ref_model, ref_attr, unwrapped_ref_model
+                    )
+
+                ref_outputs = ref_base_model(
+                    input_ids,
+                    use_cache=False,
+                    **model_kwargs,
+                )
+                ref_hidden_states = ref_outputs.last_hidden_state[:, :-1]
+            elif not self.reference_free:
+                if (
+                    hasattr(unwrapped_model, "get_decoder")
+                    and unwrapped_model.get_decoder() is not None
+                ):
+                    ref_base_model = unwrapped_model.get_decoder()
+                else:
+                    ref_attr = getattr(
+                        unwrapped_model,
+                        "base_model_prefix",
+                        self.args.base_model_attribute_name,
+                    )
+                    ref_base_model = getattr(unwrapped_model, ref_attr, unwrapped_model)
+                with self.null_ref_context():
+                    ref_outputs = ref_base_model(
+                        input_ids,
+                        use_cache=False,
+                        **model_kwargs,
+                    )
+                    ref_hidden_states = ref_outputs.last_hidden_state[:, :-1]
+
+            masked_input_ids = torch.where(
+                loss_mask != 0, input_ids, self.label_pad_token_id
+            )
+            labels = masked_input_ids[:, 1:]  # Shift right for casual LM
+
+        # Get the LM head
+        lm_head = unwrapped_model.get_output_embeddings()
+
+        # Get reference model weights if needed
+        ref_weight = None
+        ref_bias = None
+        if not self.reference_free:
+            if self.ref_model is not None:
+                unwrapped_ref_model = self.accelerator.unwrap_model(self.ref_model)
+                ref_lm_head = unwrapped_ref_model.get_output_embeddings()
+            else:
+                with self.null_ref_context():
+                    ref_lm_head = unwrapped_model.get_output_embeddings()
+            ref_weight = ref_lm_head.weight
+            ref_bias = ref_lm_head.bias if hasattr(ref_lm_head, "bias") else None
+
+        # Compute loss using Liger kernel
+        loss_output = self.dpo_loss_fn(
+            lm_head.weight,
+            hidden_states,
+            labels,
+            bias=lm_head.bias if hasattr(lm_head, "bias") else None,
+            ref_input=ref_hidden_states if not self.reference_free else None,
+            ref_weight=ref_weight if not self.reference_free else None,
+            ref_bias=ref_bias if not self.reference_free else None,
+        )
+        (
+            loss,
+            (
+                chosen_logps,
+                rejected_logps,
+                chosen_logits_mean,
+                rejected_logits_mean,
+                nll_loss,
+                *aux_outputs,
+            ),
+        ) = loss_output
+
+        output = {
+            "loss": loss,
+            "chosen_logps": chosen_logps,
+            "rejected_logps": rejected_logps,
+            "mean_chosen_logits": chosen_logits_mean,
+            "mean_rejected_logits": rejected_logits_mean,
+            "nll_loss": nll_loss,
+            "chosen_rewards": aux_outputs[0],
+            "rejected_rewards": aux_outputs[1],
+        }
+        if self.aux_loss_enabled:
+            output["aux_loss"] = outputs.aux_loss
+
+        return output
+
+    def concatenated_forward(
+        self,
+        model: nn.Module,
+        batch: dict[str, list | torch.LongTensor],
+        is_ref_model: bool = False,
+    ) -> dict[str, torch.Tensor]:
+        """
+        Runs the given model on the given batch of inputs, concatenating the chosen and rejected inputs together.
+
+        We do this to avoid doing two forward passes, because it's faster for FSDP.
+
+        Args:
+            model:
+                Model to run the forward pass on.
+            batch:
+                Batch of input data.
+            is_ref_model:
+                Whether this method is being called for the reference model. If `True`, length desensitization is not
+                applied.
+        """
+        num_examples = batch["prompt_input_ids"].shape[0]
+
+        concatenated_batch = self.concatenated_inputs(
+            batch, padding_value=self.pad_token_id
+        )
+
+        model_kwargs = {"use_cache": False}
+        if self.aux_loss_enabled:
+            model_kwargs["output_router_logits"] = True
+
+        # Add the pixel values and attention masks for vision models
+        if "pixel_values" in concatenated_batch:
+            model_kwargs["pixel_values"] = concatenated_batch["pixel_values"]
+        if "pixel_attention_mask" in concatenated_batch:
+            model_kwargs["pixel_attention_mask"] = concatenated_batch[
+                "pixel_attention_mask"
+            ]
+        if "image_sizes" in concatenated_batch:
+            model_kwargs["image_sizes"] = concatenated_batch["image_sizes"]
+
+        prompt_input_ids = concatenated_batch["prompt_input_ids"]
+        prompt_attention_mask = concatenated_batch["prompt_attention_mask"]
+        completion_input_ids = concatenated_batch["completion_input_ids"]
+        completion_attention_mask = concatenated_batch["completion_attention_mask"]
+        if self.is_encoder_decoder:
+            labels = completion_input_ids
+            labels[completion_attention_mask == 0] = self.label_pad_token_id
+            outputs = model(
+                input_ids=prompt_input_ids,
+                attention_mask=prompt_attention_mask,
+                labels=labels,  # we need the labels for the logits to be returned
+                **model_kwargs,
+            )
+            logits = outputs.logits
+            loss_mask = completion_attention_mask.bool()
+        else:
+            # Concatenate the prompt and completion inputs
+            input_ids = torch.cat((prompt_input_ids, completion_input_ids), dim=1)
+            attention_mask = torch.cat(
+                (prompt_attention_mask, completion_attention_mask), dim=1
+            )
+            if "token_type_ids" in concatenated_batch:
+                prompt_token_type_ids = concatenated_batch["token_type_ids"]
+                token_type_ids = pad_to_length(
+                    prompt_token_type_ids, input_ids.shape[1], 0
+                )
+            # Mask the prompt but not the completion for the loss
+            loss_mask = torch.cat(
+                (torch.zeros_like(prompt_attention_mask), completion_attention_mask),
+                dim=1,
+            )
+
+            # Flush and truncate
+            if self.max_length is not None and self.max_length < attention_mask.size(1):
+                if self.truncation_mode == "keep_start":
+                    # Flush left to reduce the memory usage
+                    # [[0, 0, x, x, x, x],  ->  [[x, x, x, x],
+                    #  [0, x, x, x, 0, 0]]       [x, x, x, 0]]
+                    if "token_type_ids" in concatenated_batch:
+                        attention_mask, input_ids, loss_mask, token_type_ids = (
+                            flush_left(
+                                attention_mask, input_ids, loss_mask, token_type_ids
+                            )
+                        )
+                    else:
+                        attention_mask, input_ids, loss_mask = flush_left(
+                            attention_mask, input_ids, loss_mask
+                        )
+                    attention_mask = attention_mask[:, : self.max_length]
+                    input_ids = input_ids[:, : self.max_length]
+                    loss_mask = loss_mask[:, : self.max_length]
+                elif self.truncation_mode == "keep_end":
+                    # Flush right before truncating left, then flush left
+                    # [[0, 0, x, x, x, x],  ->  [[0, 0, x, x],
+                    #  [0, x, x, x, 0, 0]]       [0, x, x, x]]
+                    if "token_type_ids" in concatenated_batch:
+                        attention_mask, input_ids, loss_mask, token_type_ids = (
+                            flush_left(
+                                attention_mask, input_ids, loss_mask, token_type_ids
+                            )
+                        )
+                        token_type_ids = token_type_ids[:, -self.max_length :]
+                    else:
+                        attention_mask, input_ids, loss_mask = flush_right(
+                            attention_mask, input_ids, loss_mask
+                        )
+                    input_ids = input_ids[:, -self.max_length :]
+                    attention_mask = attention_mask[:, -self.max_length :]
+                    loss_mask = loss_mask[:, -self.max_length :]
+                    if "token_type_ids" in concatenated_batch:
+                        attention_mask, input_ids, loss_mask, token_type_ids = (
+                            flush_left(
+                                attention_mask, input_ids, loss_mask, token_type_ids
+                            )
+                        )
+                    else:
+                        attention_mask, input_ids, loss_mask = flush_left(
+                            attention_mask, input_ids, loss_mask
+                        )
+                else:
+                    raise ValueError(
+                        f"Unknown truncation mode: '{self.truncation_mode}'. Should be one of ['keep_end', "
+                        "'keep_start']."
+                    )
+            # Flush left to reduce the memory usage
+            # [[0, 0, x, x, x, x],  ->  [[x, x, x, x],
+            #  [0, x, x, x, 0, 0]]       [x, x, x, 0]]
+            elif "token_type_ids" in concatenated_batch:
+                attention_mask, input_ids, loss_mask, token_type_ids = flush_left(
+                    attention_mask, input_ids, loss_mask, token_type_ids
+                )
+            else:
+                attention_mask, input_ids, loss_mask = flush_left(
+                    attention_mask, input_ids, loss_mask
+                )
+
+            if "token_type_ids" in concatenated_batch:
+                model_kwargs["token_type_ids"] = token_type_ids
+
+            if self.use_logits_to_keep:
+                # Compute logits_to_keep based on loss_mask pattern:
+                # [[0, 0, 0, x, x, x, x],
+                #  [0, 0, 0, x, x, x, 0]]
+                #         ^ start computing logits from here ([:, -(7-3+1):])
+                first_compute_index = loss_mask.nonzero(as_tuple=True)[1].min()
+                logits_to_keep = (
+                    loss_mask.shape[1] - first_compute_index
+                ).item() + 1  # +1 for the first label
+                model_kwargs["logits_to_keep"] = logits_to_keep
+
+            model_kwargs["output_hidden_states"] = True
+
+            if self.padding_free:
+                # Flatten the input_ids, position_ids, and loss_mask
+                # input_ids = [[a, b, c, 0], ->     input_ids = [[a, b, c, d, e, f, g]]
+                #              [d, e, f, g]]     position_ids = [[0, 1, 2, 0, 1, 2, 3]]
+                input_ids = input_ids[attention_mask.bool()].unsqueeze(0)
+                loss_mask = loss_mask[attention_mask.bool()].unsqueeze(0)
+                position_ids = (
+                    attention_mask.cumsum(1)[attention_mask.bool()].unsqueeze(0) - 1
+                )
+                model_kwargs["position_ids"] = position_ids
+            else:
+                model_kwargs["attention_mask"] = attention_mask
+
+            outputs = model(input_ids, **model_kwargs)
+            logits = outputs.logits
+
+            # Offset the logits by one to align with the labels
+            labels = torch.roll(input_ids, shifts=-1, dims=1)
+            loss_mask = torch.roll(loss_mask, shifts=-1, dims=1).bool()
+
+            if self.use_logits_to_keep:
+                # Align labels with logits
+                # logits:    -,  -, [x2, x3, x4, x5, x6]
+                #                     ^ --------- ^       after logits[:, :-1, :]
+                # labels:   [y0, y1, y2, y3, y4, y5, y6]
+                #                         ^ --------- ^   with logits_to_keep=4, [:, -4:]
+                # loss_mask: [0,  0,  0,  1,  1,  1,  1]
+                labels = labels[:, -logits_to_keep:]
+                loss_mask = loss_mask[:, -logits_to_keep:]
+
+        if logits.shape[:2] != labels.shape[:2]:
+            # for LLaVA, the returned logits include the image tokens (placed before the text tokens)
+            seq_len = labels.shape[1]
+            logits = logits[:, -seq_len:]
+
+        # Compute the log probabilities of the labels
+        labels[~loss_mask] = (
+            0  # dummy token; we'll ignore the losses on these tokens later
+        )
+        per_token_logps = selective_log_softmax(logits, labels)
+        per_token_logps[~loss_mask] = 0
+        per_token_logps = torch.roll(per_token_logps, shifts=1, dims=1)
+
+        if self.padding_free:
+            # Unflatten the per_token_logps (shape: [1, sum_seq_len] -> [batch_size, seq_len])
+            batch_size, seq_len = attention_mask.shape
+            per_token_logps_ = torch.zeros(
+                batch_size,
+                seq_len,
+                device=outputs.logits.device,
+                dtype=outputs.logits.dtype,
+            )
+            per_token_logps_[attention_mask.bool()] = per_token_logps
+            per_token_logps = per_token_logps_
+
+        all_logps = per_token_logps[:, 1:].sum(-1)
+
+        output = {}
+
+        if self.use_weighting:
+            with torch.no_grad():
+                # Eq (2) of the WPO paper: https://huggingface.co/papers/2406.11827
+                logprobs = F.log_softmax(logits, dim=-1)
+                weights_adjustment_factor = torch.logsumexp(
+                    2 * logprobs, dim=-1
+                )  # same as sum(probs**2) in log space
+                per_token_logps_adjusted = per_token_logps - weights_adjustment_factor
+                all_weights = (per_token_logps_adjusted * loss_mask).sum(
+                    -1
+                ) / loss_mask.sum(-1)
+                chosen_weights = all_weights[:num_examples]
+                rejected_weights = all_weights[num_examples:]
+                output["policy_weights"] = torch.clamp(
+                    torch.exp(chosen_weights + rejected_weights), max=1
+                )
+
+        if self.args.rpo_alpha is not None or "sft" in self.loss_type:
+            # Only use the chosen logits for the RPO loss or SFT loss
+            chosen_logits = (
+                logits[:num_examples, :-1]
+                if not self.is_encoder_decoder
+                else logits[:num_examples]
+            )
+            chosen_labels = (
+                labels[:num_examples, :-1]
+                if not self.is_encoder_decoder
+                else labels[:num_examples]
+            )
+
+            # Compute the log probabilities of the labels
+            output["nll_loss"] = F.cross_entropy(
+                torch.flatten(chosen_logits, end_dim=1),
+                torch.flatten(chosen_labels, end_dim=1),
+                ignore_index=0,
+            )
+
+        if "ipo" in self.loss_type:
+            all_logps = all_logps / loss_mask.sum(-1)
+
+        if self.args.ld_alpha is not None and not is_ref_model:
+            # Compute response lengths based on loss_mask
+            completion_lengths = loss_mask.sum(dim=1)
+
+            chosen_lengths = completion_lengths[:num_examples]
+            rejected_lengths = completion_lengths[num_examples:]
+            public_lengths = torch.min(
+                chosen_lengths, rejected_lengths
+            )  # l_p in the paper
+            public_lengths = torch.cat([public_lengths, public_lengths], dim=0)
+
+            seq_len = per_token_logps.size(1)
+            position_ids = torch.arange(
+                seq_len, device=per_token_logps.device
+            ).expand_as(per_token_logps)
+
+            ld_mask = position_ids < public_lengths.unsqueeze(1)
+            mask = position_ids < completion_lengths.unsqueeze(1)
+
+            front_mask = (ld_mask & mask).float()
+            rear_mask = (~ld_mask & mask).float()
+            front_logps = (per_token_logps * front_mask).sum(dim=1)
+            rear_logps = (per_token_logps * rear_mask).sum(dim=1)
+
+            all_logps = front_logps + self.args.ld_alpha * rear_logps
+
+        output["chosen_logps"] = all_logps[:num_examples]
+        output["rejected_logps"] = all_logps[num_examples:]
+
+        # Compute the mean logits
+        if self.padding_free:
+            # position_ids contains a sequence of range identifiers (e.g., [[0, 1, 2, 0, 1, 2, 3, ...]]).
+            # There are 2*num_examples ranges in total: the first half corresponds to the chosen tokens,
+            # and the second half to the rejected tokens.
+            # To find the start of the rejected tokens, we look for the num_examples+1-th zero in pos_id.
+            split_idx = (position_ids == 0).nonzero(as_tuple=True)[1][num_examples]
+            mean_chosen_logits = logits[0, :split_idx][loss_mask[0, :split_idx]].mean()
+            mean_rejected_logits = logits[0, split_idx:][
+                loss_mask[0, split_idx:]
+            ].mean()
+        else:
+            mean_chosen_logits = logits[:num_examples][loss_mask[:num_examples]].mean()
+            mean_rejected_logits = logits[num_examples:][
+                loss_mask[num_examples:]
+            ].mean()
+
+        output["mean_chosen_logits"] = mean_chosen_logits
+        output["mean_rejected_logits"] = mean_rejected_logits
+
+        if self.aux_loss_enabled:
+            output["aux_loss"] = outputs.aux_loss
+
+        return output
+
+    def get_batch_loss_metrics(
+        self,
+        model: PreTrainedModel | nn.Module,
+        batch: dict[str, list | torch.LongTensor],
+        train_eval: Literal["train", "eval"] = "train",
+    ) -> tuple[torch.Tensor, dict[str, float]]:
+        """Compute the DPO loss and other metrics for the given batch of inputs for train or test."""
+        metrics = {}
+
+        if self.args.use_liger_kernel:
+            model_output = self._compute_loss_liger(model, batch)
+            losses = model_output["loss"]
+            chosen_rewards = model_output["chosen_rewards"]
+            rejected_rewards = model_output["rejected_rewards"]
+        else:
+            model_output = self.concatenated_forward(model, batch)
+
+            # if ref_chosen_logps and ref_rejected_logps in batch use them, otherwise use the reference model
+            if "ref_chosen_logps" in batch and "ref_rejected_logps" in batch:
+                ref_chosen_logps = batch["ref_chosen_logps"]
+                ref_rejected_logps = batch["ref_rejected_logps"]
+            else:
+                ref_chosen_logps, ref_rejected_logps = self.compute_ref_log_probs(batch)
+
+            # Initialize combined losses
+            losses = 0
+            chosen_rewards = 0
+            rejected_rewards = 0
+
+            # Compute losses for each loss type
+            for idx, loss_type in enumerate(self.loss_type):
+                # Compute individual loss using standard DPO loss function
+                _losses, _chosen_rewards, _rejected_rewards = self.dpo_loss(
+                    model_output["chosen_logps"],
+                    model_output["rejected_logps"],
+                    ref_chosen_logps,
+                    ref_rejected_logps,
+                    loss_type,
+                    model_output,
+                )
+
+                # Add weighted contributions
+                weight = self.loss_weights[idx] if self.loss_weights else 1.0
+                losses = losses + _losses * weight
+                chosen_rewards = chosen_rewards + _chosen_rewards * weight
+                rejected_rewards = rejected_rewards + _rejected_rewards * weight
+
+        reward_accuracies = (chosen_rewards > rejected_rewards).float()
+
+        if self.args.rpo_alpha is not None:
+            losses = (
+                losses + self.args.rpo_alpha * model_output["nll_loss"]
+            )  # RPO loss from V3 of the paper
+
+        if self.use_weighting:
+            losses = losses * model_output["policy_weights"]
+
+        if self.aux_loss_enabled:
+            losses = losses + self.aux_loss_coef * model_output["aux_loss"]
+
+        prefix = "eval_" if train_eval == "eval" else ""
+        metrics[f"{prefix}rewards/chosen"] = (
+            self.accelerator.gather_for_metrics(chosen_rewards).mean().item()
+        )
+        metrics[f"{prefix}rewards/rejected"] = (
+            self.accelerator.gather_for_metrics(rejected_rewards).mean().item()
+        )
+        metrics[f"{prefix}rewards/accuracies"] = (
+            self.accelerator.gather_for_metrics(reward_accuracies).mean().item()
+        )
+        metrics[f"{prefix}rewards/margins"] = (
+            self.accelerator.gather_for_metrics(chosen_rewards - rejected_rewards)
+            .mean()
+            .item()
+        )
+        metrics[f"{prefix}logps/chosen"] = (
+            self.accelerator.gather_for_metrics(model_output["chosen_logps"])
+            .detach()
+            .mean()
+            .item()
+        )
+        metrics[f"{prefix}logps/rejected"] = (
+            self.accelerator.gather_for_metrics(model_output["rejected_logps"])
+            .detach()
+            .mean()
+            .item()
+        )
+        metrics[f"{prefix}logits/chosen"] = (
+            self.accelerator.gather_for_metrics(model_output["mean_chosen_logits"])
+            .detach()
+            .mean()
+            .item()
+        )
+        metrics[f"{prefix}logits/rejected"] = (
+            self.accelerator.gather_for_metrics(model_output["mean_rejected_logits"])
+            .detach()
+            .mean()
+            .item()
+        )
+        if self.args.rpo_alpha is not None or "sft" in self.loss_type:
+            metrics[f"{prefix}nll_loss"] = (
+                self.accelerator.gather_for_metrics(model_output["nll_loss"])
+                .detach()
+                .mean()
+                .item()
+            )
+        if self.aux_loss_enabled:
+            metrics[f"{prefix}aux_loss"] = (
+                self.accelerator.gather_for_metrics(model_output["aux_loss"])
+                .detach()
+                .mean()
+                .item()
+            )
+
+        return losses.mean(), metrics
+
+    def compute_loss(
+        self,
+        model: PreTrainedModel | nn.Module,
+        inputs: dict[str, torch.Tensor | Any],
+        return_outputs=False,
+        num_items_in_batch=None,
+    ) -> torch.Tensor | tuple[torch.Tensor, dict[str, float]]:
+        compute_loss_context_manager = (
+            autocast(self.accelerator.device.type)
+            if self._peft_has_been_casted_to_bf16
+            else nullcontext()
+        )
+        with compute_loss_context_manager:
+            loss, metrics = self.get_batch_loss_metrics(
+                model, inputs, train_eval="train"
+            )
+
+        # Make sure to move the loss to the device the original accumulating loss is at back in the `Trainer` class:
+        loss = loss.to(self.args.device)
+        # force log the metrics
+        self.store_metrics(metrics, train_eval="train")
+
+        if return_outputs:
+            return loss, metrics
+
+        return loss
+
+    def generate_from_model_and_ref(
+        self, model, batch: dict[str, torch.LongTensor]
+    ) -> tuple[str, str]:
+        """Generate samples from the model and reference model for the given batch of inputs."""
+        # If one uses `generate_during_eval` with peft + bf16, we need to explicitly call generate with
+        # the torch amp context manager as some hidden states are silently casted to full precision.
+        generate_context_manager = (
+            autocast(self.accelerator.device.type)
+            if self._peft_has_been_casted_to_bf16
+            else nullcontext()
+        )
+
+        with generate_context_manager:
+            policy_output = model.generate(
+                input_ids=batch["prompt_input_ids"],
+                attention_mask=batch["prompt_attention_mask"],
+                max_length=self.max_length,
+                do_sample=True,
+                pad_token_id=self.pad_token_id,
+            )
+
+            # if ref_output in batch use that otherwise use the reference model
+            if "ref_output" in batch:
+                ref_output = batch["ref_output"]
+            elif self.ref_model is None:
+                with self.null_ref_context():
+                    ref_output = self.model.generate(
+                        input_ids=batch["prompt_input_ids"],
+                        attention_mask=batch["prompt_attention_mask"],
+                        max_length=self.max_length,
+                        do_sample=True,
+                        pad_token_id=self.pad_token_id,
+                    )
+            else:
+                ref_output = self.ref_model.generate(
+                    input_ids=batch["prompt_input_ids"],
+                    attention_mask=batch["prompt_attention_mask"],
+                    max_length=self.max_length,
+                    do_sample=True,
+                    pad_token_id=self.pad_token_id,
+                )
+
+        policy_output = pad_to_length(policy_output, self.max_length, self.pad_token_id)
+        policy_output_decoded = self.processing_class.batch_decode(
+            policy_output, skip_special_tokens=True
+        )
+
+        ref_output = pad_to_length(ref_output, self.max_length, self.pad_token_id)
+        ref_output_decoded = self.processing_class.batch_decode(
+            ref_output, skip_special_tokens=True
+        )
+
+        return policy_output_decoded, ref_output_decoded
+
+    def prediction_step(
+        self,
+        model: PreTrainedModel | nn.Module,
+        inputs: dict[str, torch.Tensor | Any],
+        prediction_loss_only: bool,
+        ignore_keys: list[str] | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None, torch.Tensor | None]:
+        if ignore_keys is None:
+            if hasattr(model, "config"):
+                ignore_keys = getattr(model.config, "keys_to_ignore_at_inference", [])
+            else:
+                ignore_keys = []
+
+        prediction_context_manager = (
+            autocast(self.accelerator.device.type)
+            if self._peft_has_been_casted_to_bf16
+            else nullcontext()
+        )
+
+        with torch.no_grad(), prediction_context_manager:
+            loss, metrics = self.get_batch_loss_metrics(
+                model, inputs, train_eval="eval"
+            )
+
+        # force log the metrics
+        self.store_metrics(metrics, train_eval="eval")
+
+        if prediction_loss_only:
+            return loss.detach(), None, None
+
+        # logits for the chosen and rejected samples from model
+        logits_dict = {
+            "eval_logits/chosen": metrics["eval_logits/chosen"],
+            "eval_logits/rejected": metrics["eval_logits/rejected"],
+        }
+        logits = [v for k, v in logits_dict.items() if k not in ignore_keys]
+        logits = torch.tensor(logits, device=self.accelerator.device)
+        labels = torch.zeros(logits.shape[0], device=self.accelerator.device)
+
+        return (loss.detach(), logits, labels)
+
+    def store_metrics(
+        self, metrics: dict[str, float], train_eval: Literal["train", "eval"] = "train"
+    ) -> None:
+        for key, value in metrics.items():
+            self._stored_metrics[train_eval][key].append(value)
+
+    def evaluation_loop(
+        self,
+        dataloader: DataLoader,
+        description: str,
+        prediction_loss_only: bool | None = None,
+        ignore_keys: list[str] | None = None,
+        metric_key_prefix: str = "eval",
+    ) -> EvalLoopOutput:
+        """
+        Overriding built-in evaluation loop to store metrics for each batch. Prediction/evaluation loop, shared by
+        `Trainer.evaluate()` and `Trainer.predict()`.
+
+        Works both with or without labels.
+        """
+        # Sample and save to game log if requested (for one batch to save time)
+        if self.generate_during_eval:
+            # Generate random indices within the range of the total number of samples
+            num_samples = len(dataloader.dataset)
+            random_indices = random.sample(
+                range(num_samples), k=self.args.eval_batch_size
+            )
+
+            # Use dataloader.dataset.select to get the random batch without iterating over the DataLoader
+            random_batch_dataset = dataloader.dataset.select(random_indices)
+            random_batch = self.data_collator(random_batch_dataset)
+            random_batch = self._prepare_inputs(random_batch)
+
+            policy_output_decoded, ref_output_decoded = (
+                self.generate_from_model_and_ref(self.model, random_batch)
+            )
+
+            table = pd.DataFrame(
+                columns=["Prompt", "Policy", "Ref Model"],
+                data=[
+                    [prompt, pol[len(prompt) :], ref[len(prompt) :]]
+                    for prompt, pol, ref in zip(
+                        random_batch_dataset["prompt"],
+                        policy_output_decoded,
+                        ref_output_decoded,
+                        strict=True,
+                    )
+                ],
+            )
+            if "wandb" in self.args.report_to and self.accelerator.is_main_process:
+                wandb.log({"game_log": wandb.Table(data=table)})
+
+            if "comet_ml" in self.args.report_to:
+                log_table_to_comet_experiment(
+                    name="game_log.csv",
+                    table=table,
+                )
+
+            if "mlflow" in self.args.report_to and self.accelerator.is_main_process:
+                mlflow.log_table(data=table, artifact_file="game_log.json")
+
+        # Base evaluation
+        initial_output = super().evaluation_loop(
+            dataloader,
+            description,
+            prediction_loss_only,
+            ignore_keys,
+            metric_key_prefix,
+        )
+
+        return initial_output
+
+    def log(self, logs: dict[str, float], start_time: float | None = None) -> None:
+        """
+        Log `logs` on the various objects watching training, including stored metrics.
+
+        Args:
+            logs (`dict[str, float]`):
+                The values to log.
+            start_time (`float`, *optional*):
+                Start time of the training.
+        """
+        # logs either has 'loss' or 'eval_loss'
+        train_eval = "train" if "loss" in logs else "eval"
+        # Add averaged stored metrics to logs
+        for key, metrics in self._stored_metrics[train_eval].items():
+            logs[key] = torch.tensor(metrics).mean().item()
+        del self._stored_metrics[train_eval]
+        return super().log(logs, start_time)
+
+    # Ensure the model card is saved along with the checkpoint
+    def _save_checkpoint(self, model, trial):
+        if self.args.hub_model_id is None:
+            model_name = Path(self.args.output_dir).name
+        else:
+            model_name = self.args.hub_model_id.split("/")[-1]
+        self.create_model_card(model_name=model_name)
+        super()._save_checkpoint(model, trial)
diff --git a/src/aixpert/training/training/trl/trainer/gkd_config.py b/src/aixpert/training/training/trl/trainer/gkd_config.py
new file mode 100644
index 0000000..af8635e
--- /dev/null
+++ b/src/aixpert/training/training/trl/trainer/gkd_config.py
@@ -0,0 +1,116 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass, field
+from typing import Any
+
+from transformers import TrainingArguments
+
+from .sft_config import SFTConfig
+
+
+@dataclass
+class GKDConfig(SFTConfig):
+    """
+    Configuration class for [`GKDTrainer`].
+
+    This class includes only the parameters that are specific to GKD training. For a full list of training arguments,
+    please refer to the [`~transformers.TrainingArguments`] and [`SFTConfig`] documentation.
+
+    Args:
+        temperature (`float`, *optional*, defaults to `0.9`):
+            Temperature for sampling. The higher the temperature, the more random the completions.
+        lmbda (`float`, *optional*, defaults to `0.5`):
+            Lambda parameter that controls the student data fraction (i.e., the proportion of on-policy
+            student-generated outputs).
+        beta (`float`, *optional*, defaults to `0.5`):
+            Interpolation coefficient between `0.0` and `1.0` of the Generalized Jensen-Shannon Divergence loss. When
+            beta is `0.0`, the loss is the KL divergence. When beta is `1.0`, the loss is the Inverse KL Divergence.
+        max_new_tokens (`int`, *optional*, defaults to `128`):
+            Maximum number of tokens to generate per completion.
+        teacher_model_name_or_path (`str`, *optional*):
+            Model name or path of the teacher model. If `None`, the teacher model will be the same as the model being
+            trained.
+        teacher_model_init_kwargs (`dict[str, Any]]`, *optional*):
+            Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the teacher model
+            from a string.
+        disable_dropout (`bool`, *optional*, defaults to `True`):
+            Whether to disable dropout in the model.
+        seq_kd (`bool`, *optional*, defaults to `False`):
+            Seq_kd parameter that controls whether to perform Sequence-Level KD (can be viewed as supervised FT on
+            teacher-generated output).
+    """
+
+    _VALID_DICT_FIELDS = TrainingArguments._VALID_DICT_FIELDS + [
+        "teacher_model_init_kwargs"
+    ]
+
+    temperature: float = field(
+        default=0.9,
+        metadata={
+            "help": "Temperature for sampling. The higher the temperature, the more random the completions."
+        },
+    )
+    lmbda: float = field(
+        default=0.5,
+        metadata={
+            "help": "Lambda parameter that controls the student data fraction (i.e., the proportion of on-policy "
+            "student-generated outputs)."
+        },
+    )
+    beta: float = field(
+        default=0.5,
+        metadata={
+            "help": "Interpolation coefficient between `0.0` and `1.0` of the Generalized Jensen-Shannon Divergence "
+            "loss. When beta is `0.0`, the loss is the KL divergence. When beta is `1.0`, the loss is the Inverse KL "
+            "Divergence."
+        },
+    )
+    max_new_tokens: int = field(
+        default=128,
+        metadata={"help": "Maximum number of tokens to generate per completion."},
+    )
+    teacher_model_name_or_path: str | None = field(
+        default=None,
+        metadata={
+            "help": "Model name or path of the teacher model. If `None`, the teacher model will be the same as the "
+            "model being trained."
+        },
+    )
+    teacher_model_init_kwargs: dict[str, Any] | None = field(
+        default=None,
+        metadata={
+            "help": "Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the "
+            "teacher model from a string."
+        },
+    )
+    disable_dropout: bool = field(
+        default=True,
+        metadata={"help": "Whether to disable dropouts in `model`."},
+    )
+    seq_kd: bool = field(
+        default=False,
+        metadata={
+            "help": "Seq_kd parameter that controls whether to perform Sequence-Level KD (can be viewed as supervised "
+            "FT on teacher-generated output)."
+        },
+    )
+
+    def __post_init__(self):
+        super().__post_init__()
+        # check lmbda and beta are in the range [0, 1]
+        if self.lmbda < 0.0 or self.lmbda > 1.0:
+            raise ValueError("lmbda must be in the range [0.0, 1.0].")
+        if self.beta < 0.0 or self.beta > 1.0:
+            raise ValueError("beta must be in the range [0.0, 1.0].")
diff --git a/src/aixpert/training/training/trl/trainer/gkd_trainer.py b/src/aixpert/training/training/trl/trainer/gkd_trainer.py
new file mode 100644
index 0000000..e0cce2a
--- /dev/null
+++ b/src/aixpert/training/training/trl/trainer/gkd_trainer.py
@@ -0,0 +1,519 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import random
+import textwrap
+import warnings
+from collections.abc import Callable
+from typing import Any
+
+import torch
+import torch.nn.functional as F
+from datasets import Dataset
+from torch import nn
+from transformers import (
+    AutoModelForCausalLM,
+    BaseImageProcessor,
+    DataCollator,
+    FeatureExtractionMixin,
+    GenerationConfig,
+    PreTrainedModel,
+    PreTrainedTokenizerBase,
+    ProcessorMixin,
+)
+from transformers.trainer_callback import TrainerCallback
+from transformers.trainer_utils import EvalPrediction
+from transformers.utils import is_liger_kernel_available, is_peft_available
+
+from ..models import prepare_deepspeed
+from ..models.utils import unwrap_model_for_generation
+from .gkd_config import GKDConfig
+from .sft_trainer import SFTTrainer
+from .utils import DataCollatorForChatML, disable_dropout_in_model, empty_cache
+
+
+if is_peft_available():
+    from peft import PeftConfig
+
+if is_liger_kernel_available():
+    from liger_kernel.chunked_loss import LigerFusedLinearJSDLoss
+
+
+class GKDTrainer(SFTTrainer):
+    """Trainer for Generalized Knowledge Distillation (GKD) of language models.
+
+    For details on GKD, see the paper: [On-Policy Distillation of Language Models: Learning from Self-Generated
+    Mistakes](https://huggingface.co/papers/2306.13649).
+
+    Args:
+        model ([`~transformers.PreTrainedModel`] or `torch.nn.Module` or `str`, *optional*):
+            Model to be trained, or the string identifier of the model to be instantiated from a pretrained model.
+        teacher_model ([`~transformers.PreTrainedModel`] or `torch.nn.Module` or `str`, *optional*):
+            Teacher model for knowledge distillation, or the string identifier of the model to be instantiated from a
+            pretrained model.
+        args ([`GKDConfig`], *optional*):
+            Training arguments.
+        data_collator ([`~transformers.DataCollator`], *optional*):
+            Data collator to batch samples from the dataset. It defaults to a [`DataCollatorForChatML`] using the
+            `processing_class`.
+        train_dataset ([`~datasets.Dataset`], *optional*):
+            Dataset for training.
+        eval_dataset ([`~datasets.Dataset`] or `dict` of [`~datasets.Dataset`], *optional*):
+            Dataset for evaluation.
+        processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*):
+           Class to process the data.
+        compute_metrics (`Callable`, *optional*):
+            Function to compute metrics at evaluation. Must take in an [`~transformers.EvalPrediction`] and return a
+            dictionary string to float.
+        callbacks (`list` of [`~transformers.TrainerCallback`], *optional*):
+            Callbacks to use during training.
+        optimizers (`tuple` of `torch.optim.Optimizer` and `torch.optim.lr_scheduler.LambdaLR`, *optional*, defaults to `(None, None)`):
+            Tuple containing the optimizer and the learning rate scheduler to use for training.
+        preprocess_logits_for_metrics (`Callable`, *optional*):
+            Function to preprocess the logits before computing the metrics. Must take in the `logits` and `labels` and
+            return the logits to be used for metrics computation.
+        peft_config ([`~peft.PeftConfig`], *optional*):
+            PEFT configuration to use PEFT for training. If `None`, PEFT is not used. If provided, the `model` will be
+            wrapped with the specified PEFT adapter.
+        formatting_func (`Callable`, *optional*):
+            Function to format the dataset. Must take in an example and return an example.
+    """
+
+    _tag_names = ["trl", "gkd"]
+    _name = "GKD"
+    _paper = {
+        "title": "On-Policy Distillation of Language Models: Learning from Self-Generated Mistakes",
+        "id": "2306.13649",
+        # docstyle-ignore
+        "citation": textwrap.dedent("""\
+            @inproceedings{agarwal2024on-policy,
+                title        = {{On-Policy Distillation of Language Models: Learning from Self-Generated Mistakes}},
+                author       = {Rishabh Agarwal and Nino Vieillard and Yongchao Zhou and Piotr Stanczyk and Sabela Ramos Garea and Matthieu Geist and Olivier Bachem},
+                year         = 2024,
+                booktitle    = {The Twelfth International Conference on Learning Representations, {ICLR} 2024, Vienna, Austria, May 7-11, 2024},
+                publisher    = {OpenReview.net},
+                url          = {https://openreview.net/forum?id=3zKtaqxLhW},
+            }"""),
+    }
+
+    def __init__(
+        self,
+        model: PreTrainedModel | nn.Module | str | None = None,
+        teacher_model: PreTrainedModel | nn.Module | str = None,
+        args: GKDConfig | None = None,
+        data_collator: DataCollator | None = None,  # type: ignore
+        train_dataset: Dataset | None = None,
+        eval_dataset: Dataset | dict[str, Dataset] | None = None,
+        processing_class: PreTrainedTokenizerBase
+        | BaseImageProcessor
+        | FeatureExtractionMixin
+        | ProcessorMixin
+        | None = None,
+        compute_metrics: Callable[[EvalPrediction], dict] | None = None,
+        callbacks: list[TrainerCallback] | None = None,
+        optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (
+            None,
+            None,
+        ),
+        preprocess_logits_for_metrics: Callable[
+            [torch.Tensor, torch.Tensor], torch.Tensor
+        ]
+        | None = None,
+        peft_config: "PeftConfig | None" = None,
+        formatting_func: Callable | None = None,
+    ):
+        if not os.environ.get("TRL_EXPERIMENTAL_SILENCE"):
+            warnings.warn(
+                "This trainer will soon be moved to trl.experimental and is a candidate for removal. If you rely on "
+                "it and want it to remain, please share your comments here: "
+                "https://github.com/huggingface/trl/issues/4223. Silence this warning by setting environment variable "
+                "TRL_EXPERIMENTAL_SILENCE=1."
+            )
+        # Ensure Trainer does not drop non-signature columns used by the collator (e.g., "prompts")
+        args.remove_unused_columns = False
+        # Respect a user-provided data_collator; otherwise, provide a ChatML collator that
+        if data_collator is None:
+            data_collator = DataCollatorForChatML(
+                tokenizer=processing_class, max_length=args.max_length
+            )
+
+        # Ensure SFTTrainer does not pre-process the dataset when using a ChatML collator,
+        # so that raw conversational fields (e.g., "messages") remain available to the collator.
+        if args.dataset_kwargs is None:
+            args.dataset_kwargs = {"skip_prepare_dataset": True}
+        else:
+            args.dataset_kwargs["skip_prepare_dataset"] = True
+
+        # Liger fused GKD loss (JSD)
+        self.use_liger_gkd_loss = False
+        if args.use_liger_kernel:
+            self.liger_jsd_loss = LigerFusedLinearJSDLoss(
+                beta=args.beta,
+                ignore_index=-100,
+                temperature=args.temperature,
+                compiled=False,
+            )
+            self.use_liger_gkd_loss = True
+
+        super().__init__(
+            model,
+            args=args,
+            data_collator=data_collator,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            processing_class=processing_class,
+            compute_metrics=compute_metrics,
+            callbacks=callbacks,
+            optimizers=optimizers,
+            preprocess_logits_for_metrics=preprocess_logits_for_metrics,
+            peft_config=peft_config,
+            formatting_func=formatting_func,
+        )
+
+        if args.teacher_model_init_kwargs is None:
+            teacher_model_init_kwargs = {}
+        elif not isinstance(teacher_model, str):
+            raise ValueError(
+                "You passed teacher_model_init_kwargs to the GKDConfig, but your teacher_model is already instantiated."
+            )
+        else:
+            teacher_model_init_kwargs = args.teacher_model_init_kwargs
+            teacher_model_init_kwargs["dtype"] = (
+                teacher_model_init_kwargs["dtype"]
+                if teacher_model_init_kwargs["dtype"] in ["auto", None]
+                else getattr(torch, teacher_model_init_kwargs["dtype"])
+            )
+
+        if isinstance(teacher_model, str):
+            teacher_model = AutoModelForCausalLM.from_pretrained(
+                teacher_model, **teacher_model_init_kwargs
+            )
+
+        # Disable dropout in the model
+        if args.disable_dropout:
+            disable_dropout_in_model(self.model)
+
+        if self.is_deepspeed_enabled:
+            self.teacher_model = prepare_deepspeed(teacher_model, self.accelerator)
+        else:
+            self.teacher_model = self.accelerator.prepare_model(
+                teacher_model, evaluation_mode=True
+            )
+
+        self.lmbda = args.lmbda
+        self.beta = args.beta
+        self.temperature = args.temperature
+        self.seq_kd = args.seq_kd
+
+        self.generation_config = GenerationConfig(
+            max_new_tokens=args.max_new_tokens,
+            temperature=args.temperature,
+            do_sample=True,
+            top_k=0,
+            use_cache=False if args.gradient_checkpointing else True,
+            pad_token_id=self.processing_class.pad_token_id,
+        )
+        # Set custom EOS tokens if they are specified by the model's generation
+        # config. This is important for models with the Llama 3 chat template,
+        # which use special tokens <|eot_id|> and <|eom_id|> to mark the end of
+        # turns or messages.
+        if (
+            hasattr(self.model.generation_config, "eos_token_id")
+            and self.model.generation_config.eos_token_id is not None
+        ):
+            self.generation_config.eos_token_id = (
+                self.model.generation_config.eos_token_id
+            )
+
+    @staticmethod
+    def generalized_jsd_loss(
+        student_logits,
+        teacher_logits,
+        labels=None,
+        beta=0.5,
+        temperature=1.0,
+        reduction="batchmean",
+    ):
+        """
+        Compute the generalized Jensen-Shannon Divergence loss for knowledge distillation using F.kl_div. See Eq. (1)
+        of https://huggingface.co/papers/2306.13649 for the definition.
+
+        Args:
+            student_logits:
+                Tensor of shape (batch_size, sequence_length, vocab_size)
+            teacher_logits:
+                Tensor of shape (batch_size, sequence_length, vocab_size)
+            labels:
+                Tensor of shape (batch_size, sequence_length) with -100 for padding tokens to ignore when computing
+                loss
+            beta:
+                Interpolation coefficient between 0 and 1 (default: 0.5)
+            temperature:
+                Softmax temperature (default: 1.0)
+            reduction:
+                Specifies the reduction to apply to the output (default: 'batchmean')
+
+        Returns
+        -------
+            loss: Scalar tensor with the generalized JSD loss
+        """
+        # Apply temperature scaling
+        student_logits = student_logits / temperature
+        teacher_logits = teacher_logits / temperature
+
+        # Compute log probabilities for student and probabilities for teacher
+        student_log_probs = F.log_softmax(student_logits, dim=-1)
+        teacher_log_probs = F.log_softmax(teacher_logits, dim=-1)
+
+        if beta == 0:
+            jsd = F.kl_div(
+                student_log_probs, teacher_log_probs, reduction="none", log_target=True
+            )
+        elif beta == 1:
+            jsd = F.kl_div(
+                teacher_log_probs, student_log_probs, reduction="none", log_target=True
+            )
+        else:
+            # Compute the log of the mixture distribution
+            # log(a + b) = log(exp(log(a)) + exp(log(b))) -> for mixture
+            beta = torch.tensor(beta, dtype=student_log_probs.dtype)
+            mixture_log_probs = torch.logsumexp(
+                torch.stack(
+                    [
+                        student_log_probs + torch.log(1 - beta),
+                        teacher_log_probs + torch.log(beta),
+                    ]
+                ),
+                dim=0,
+            )
+
+            # Compute KL divergences using F.kl_div
+            # PyTorch differs from the standard mathematical definition, so the order of the probability distributions is swapped compared to that defined in the paper.
+            kl_teacher = F.kl_div(
+                mixture_log_probs, teacher_log_probs, reduction="none", log_target=True
+            )
+            kl_student = F.kl_div(
+                mixture_log_probs, student_log_probs, reduction="none", log_target=True
+            )
+
+            # Compute the Generalized Jensen-Shannon Divergence
+            jsd = beta * kl_teacher + (1 - beta) * kl_student
+
+        # Masking
+        if labels is not None:
+            mask = labels != -100
+            jsd = jsd[mask]
+
+        # Apply reduction
+        if reduction == "batchmean":
+            return (
+                jsd.sum() / mask.sum()
+                if labels is not None
+                else jsd.sum() / jsd.size(0)
+            )
+        if reduction == "sum":
+            return jsd.sum()
+        if reduction == "mean":
+            return jsd.mean()
+        return jsd
+
+    def compute_loss(
+        self, model, inputs, return_outputs=False, num_items_in_batch=None
+    ):
+        if self.use_liger_gkd_loss:
+            # Forward only through the base models (avoid lm_head to save memory)
+            unwrapped_student = self.accelerator.unwrap_model(model)
+            if (
+                hasattr(unwrapped_student, "get_decoder")
+                and unwrapped_student.get_decoder() is not None
+            ):
+                base_student = unwrapped_student.get_decoder()
+            else:
+                base_student = getattr(
+                    unwrapped_student,
+                    getattr(unwrapped_student, "base_model_prefix", "model"),
+                    unwrapped_student,
+                )
+
+            student_outputs = base_student(
+                input_ids=inputs["input_ids"],
+                attention_mask=inputs["attention_mask"],
+                use_cache=False,
+            )
+
+            self.teacher_model.eval()
+            unwrapped_teacher = self.accelerator.unwrap_model(self.teacher_model)
+            if (
+                hasattr(unwrapped_teacher, "get_decoder")
+                and unwrapped_teacher.get_decoder() is not None
+            ):
+                base_teacher = unwrapped_teacher.get_decoder()
+            else:
+                base_teacher = getattr(
+                    unwrapped_teacher,
+                    getattr(unwrapped_teacher, "base_model_prefix", "model"),
+                    unwrapped_teacher,
+                )
+            with torch.no_grad():
+                teacher_outputs = base_teacher(
+                    input_ids=inputs["input_ids"],
+                    attention_mask=inputs["attention_mask"],
+                    use_cache=False,
+                )
+
+            # hidden states (shifted)
+            student_hidden = student_outputs.last_hidden_state[:, :-1]
+            teacher_hidden = teacher_outputs.last_hidden_state[:, :-1]
+
+            # Release full outputs to free memory
+            del student_outputs, teacher_outputs
+
+            # labels mask and labels (shifted)
+            labels_mask = inputs["labels"] != -100
+            masked_input_ids = torch.where(
+                labels_mask,
+                inputs["input_ids"],
+                torch.full_like(inputs["input_ids"], -100),
+            )
+            true_labels = masked_input_ids[:, 1:].contiguous()
+
+            # Release intermediate tensors
+            del labels_mask, masked_input_ids
+
+            # heads
+            student_head = unwrapped_student.get_output_embeddings()
+            teacher_head = unwrapped_teacher.get_output_embeddings()
+
+            # liger fused jsd loss
+            loss = self.liger_jsd_loss(
+                student_input=student_hidden,
+                student_weight=student_head.weight,
+                teacher_input=teacher_hidden,
+                teacher_weight=teacher_head.weight,
+                true_labels=true_labels,
+                student_bias=getattr(student_head, "bias", None),
+                teacher_bias=getattr(teacher_head, "bias", None),
+            )
+
+            # Release hidden states after loss computation
+            del student_hidden, teacher_hidden, true_labels
+        else:
+            # compute student output
+            student_outputs = model(
+                input_ids=inputs["input_ids"],
+                attention_mask=inputs["attention_mask"],
+            )
+
+            # compute teacher output in eval mode
+            self.teacher_model.eval()
+            with torch.no_grad():
+                teacher_outputs = self.teacher_model(
+                    input_ids=inputs["input_ids"],
+                    attention_mask=inputs["attention_mask"],
+                )
+
+            # slice the logits for the generated tokens using the inputs["prompts"] lengths
+            prompt_lengths = inputs["prompts"].shape[1]
+            shifted_student_logits = student_outputs.logits[
+                :, prompt_lengths - 1 : -1, :
+            ]
+            shifted_teacher_logits = teacher_outputs.logits[
+                :, prompt_lengths - 1 : -1, :
+            ]
+            shifted_labels = inputs["labels"][:, prompt_lengths:]
+
+            # compute loss
+            loss = self.generalized_jsd_loss(
+                student_logits=shifted_student_logits,
+                teacher_logits=shifted_teacher_logits,
+                labels=shifted_labels,
+                beta=self.beta,
+            )
+
+        # empty cache
+        empty_cache()
+
+        # Return loss
+        return (loss, student_outputs) if return_outputs else loss
+
+    @staticmethod
+    def generate_on_policy_outputs(model, inputs, generation_config, pad_token_id=None):
+        # Generate output with respect to the prompt-only
+        generated_outputs = model.generate(
+            input_ids=inputs["prompts"],
+            attention_mask=inputs.get("prompt_attention_mask", None),
+            generation_config=generation_config,
+            return_dict_in_generate=True,
+        )
+
+        # Get the generated token IDs
+        generated_tokens = generated_outputs.sequences
+        # Calculate new attention mask
+        new_attention_mask = torch.ones_like(generated_tokens)
+        new_labels = generated_tokens.clone()
+
+        # If there's pad_token_id, set attention mask to 0 for padding tokens
+        if pad_token_id is not None:
+            new_labels[new_labels == pad_token_id] = -100
+            new_attention_mask[generated_tokens == pad_token_id] = 0
+
+        return generated_tokens, new_attention_mask, new_labels
+
+    def training_step(
+        self,
+        model: nn.Module,
+        inputs: dict[str, torch.Tensor | Any],
+        num_items_in_batch: int | None = None,
+    ) -> torch.Tensor:
+        """
+        Perform a training step for the Generalized Knowledge Distillation (GKD) model.
+
+        This method implements the on-policy learning approach described in the GKD paper. With probability
+        `self.lmbda`, it generates new responses using the student model, which are then used for training instead of
+        the original inputs.
+        """
+        if self.seq_kd:
+            with unwrap_model_for_generation(
+                self.teacher_model, self.accelerator
+            ) as unwrapped_model:
+                new_input_ids, new_attention_mask, new_labels = (
+                    self.generate_on_policy_outputs(
+                        unwrapped_model,
+                        inputs,
+                        self.generation_config,
+                        self.processing_class.pad_token_id,
+                    )
+                )
+            inputs["input_ids"] = new_input_ids
+            inputs["attention_mask"] = new_attention_mask
+            inputs["labels"] = new_labels
+        if random.random() <= self.lmbda:
+            with unwrap_model_for_generation(
+                model, self.accelerator
+            ) as unwrapped_model:
+                new_input_ids, new_attention_mask, new_labels = (
+                    self.generate_on_policy_outputs(
+                        unwrapped_model,
+                        inputs,
+                        self.generation_config,
+                        self.processing_class.pad_token_id,
+                    )
+                )
+            inputs["input_ids"] = new_input_ids
+            inputs["attention_mask"] = new_attention_mask
+            inputs["labels"] = new_labels
+
+        loss = super().training_step(model, inputs, num_items_in_batch)
+        return loss
diff --git a/src/aixpert/training/training/trl/trainer/grpo_config.py b/src/aixpert/training/training/trl/trainer/grpo_config.py
new file mode 100644
index 0000000..f0c13e3
--- /dev/null
+++ b/src/aixpert/training/training/trl/trainer/grpo_config.py
@@ -0,0 +1,780 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+from dataclasses import dataclass, field
+
+from transformers import TrainingArguments
+
+
+@dataclass
+class GRPOConfig(TrainingArguments):
+    r"""
+    Configuration class for the [`GRPOTrainer`].
+
+    This class includes only the parameters that are specific to GRPO training. For a full list of training arguments,
+    please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this class may
+    differ from those in [`~transformers.TrainingArguments`].
+
+    Using [`~transformers.HfArgumentParser`] we can turn this class into
+    [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
+    command line.
+
+    Parameters
+    ----------
+        > Parameters that control the model and reference model
+
+        model_init_kwargs (`str`, `dict[str, Any]`, *optional*):
+            Keyword arguments for [`~transformers.AutoModelForCausalLM.from_pretrained`], used when the `model`
+            argument of the [`GRPOTrainer`] is provided as a string.
+        disable_dropout (`bool`, *optional*, defaults to `False`):
+            Whether to disable dropout in the model. This is useful for training with a reference model, as it prevents
+            the model from generating different logprobs for the same input.
+        cast_lm_head_to_fp32 (`bool`, *optional*, defaults to `False`):
+            Whether to cast the language modeling head of the policy and reference models to float32. As recommended by
+            the [ScaleRL](https://huggingface.co/papers/2510.13786) recipe. This flag is only supported when the model
+            has untied word embedding and language modeling head layers i.e. `tie_word_embeddings` in the model config
+            is False.
+
+        > Parameters that control the data preprocessing
+        remove_unused_columns (`bool`, *optional*, defaults to `False`):
+            Whether to only keep the column `"prompt"` in the dataset. If you use a custom reward function that
+            requires any column other than `"prompts"` and `"completions"`, you should keep this to `False`.
+        max_prompt_length (`int` or `None`, *optional*, defaults to `512`):
+            Maximum length of the prompt. If the prompt is longer than this value, it will be truncated left.
+        num_generations (`int` or `None`, *optional*, defaults to `8`):
+            Number of generations per prompt to sample. The effective batch size (num_processes * per_device_batch_size
+            * gradient_accumulation_steps) must be evenly divisible by this value.
+        max_completion_length (`int` or `None`, *optional*, defaults to `256`):
+            Maximum length of the generated completion.
+        ds3_gather_for_generation (`bool`, *optional*, defaults to `True`):
+            This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation,
+            improving generation speed. However, disabling this option allows training models that exceed the VRAM
+            capacity of a single GPU, albeit at the cost of slower generation. Disabling this option is not compatible
+            with vLLM generation.
+        shuffle_dataset (`bool`, *optional*, defaults to `True`):
+            Whether to shuffle the training dataset.
+
+        > Parameters that control generation
+
+        generation_batch_size: (`int`, *optional*):
+            Batch size to use for generation. If `None`, it defaults to the effective training batch size:
+            `per_device_train_batch_size * num_processes * steps_per_generation`. In other words, there is one
+            generation batch processed per optimization step. Mutually exclusive with `steps_per_generation`.
+        steps_per_generation: (`int`, *optional*):
+            Number of steps per generation. If `None`, it defaults to `gradient_accumulation_steps`. Mutually exclusive
+            with `generation_batch_size`.
+        temperature (`float`, defaults to `1.0`):
+            Temperature for sampling. The higher the temperature, the more random the completions.
+        top_p (`float`, *optional*, defaults to `1.0`):
+            Float that controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. Set to
+            `1.0` to consider all tokens.
+        top_k (`int`, *optional*):
+            Number of highest probability vocabulary tokens to keep for top-k-filtering. If `None`, top-k-filtering is
+            disabled and all tokens are considered.
+        min_p (`float`, *optional*):
+            Minimum token probability, which will be scaled by the probability of the most likely token. It must be a
+            value between `0.0` and `1.0`. Typical values are in the `0.01-0.2` range.
+        generation_kwargs (`dict[str, Any]`, *optional*):
+            Additional keyword arguments to pass to [`~transformers.GenerationConfig`] (if using transformers) or
+            `SamplingParams` (if using vLLM) when sampling completions. This can be used to further customize the
+            generation behavior, such as setting `suppress_tokens`, `num_beams`, etc. If it contains keys that conflict
+            with the other generation parameters (like `min_p`, `top_p`, etc.), they will override them.
+        chat_template_kwargs (`dict[str, Any]`, *optional*):
+            Additional keyword arguments to pass to the `apply_chat_template` function when generating completions.
+        repetition_penalty (`float`, *optional*, defaults to `1.0`):
+            Float that penalizes new tokens based on whether they appear in the prompt and the generated text so far.
+            Values > `1.0` encourage the model to use new tokens, while values < `1.0` encourage the model to repeat
+            tokens.
+        use_transformers_paged (`bool`, *optional*, defaults to `False`):
+            Whether to use the `transformers` paged implementation for generation. If set to `True`, the `transformers`
+            paged implementation will be used for generation instead of the default padded implementation. This
+            parameter is only effective when `use_vllm` is set to `False`.
+        cache_implementation (`str`, *optional*):
+            Implementation of the cache method for faster generation when `use_vllm` is set to `False`.
+
+        > Parameters that control generation acceleration powered by vLLM
+
+        use_vllm (`bool`, *optional*, defaults to `False`):
+            Whether to use vLLM for generating completions. If set to `True`, the trainer will use vLLM for generation
+            instead of the default model.generate(). Requires `vllm` to be installed.
+        vllm_mode (`str`, *optional*, defaults to `"server"`):
+            Mode to use for vLLM integration when `use_vllm` is set to `True`. Must be one of `"server"` or
+            `"colocate"`.
+
+            - `"server"`: The trainer will send generation requests to a separate vLLM server. Make sure a TRL vLLM
+              server is running (start with `trl vllm-serve`).
+            - `"colocate"`: vLLM will run in the same process and share the training GPUs. This avoids the need for a
+              separate server but may cause resource contention with training.
+        vllm_model_impl (`str`, *optional*, defaults to `"vllm"`):
+            Model implementation to use for vLLM. Must be one of `"transformers"` or `"vllm"`. `"transformers"`: Use
+            the `transformers` backend for model implementation. `"vllm"`: Use the `vllm` library for model
+            implementation.
+        vllm_guided_decoding_regex (`str`, *optional*):
+            Regex for vLLM guided decoding. If `None` (default), guided decoding is disabled.
+
+        > Parameters that control the vLLM server (only used when `vllm_mode` is `"server"`)
+
+        vllm_server_base_url (`str`, *optional*):
+            Base URL for the vLLM server (e.g., `"http://localhost:8000"`). If provided, `vllm_server_host` and
+            `vllm_server_port` are ignored.
+        vllm_server_host (`str`, *optional*, defaults to `"0.0.0.0"`):
+            Host of the vLLM server to connect to. Ignored if `vllm_server_base_url` is provided.
+        vllm_server_port (`int`, *optional*, defaults to `8000`):
+            Port of the vLLM server to connect to. Ignored if `vllm_server_base_url` is provided.
+        vllm_server_timeout (`float`, *optional*, defaults to `240.0`):
+            Total timeout duration in seconds to wait for the vLLM server to be up. If the server is not up after the
+            timeout, a `ConnectionError` is raised.
+
+        > Parameters that control colocated vLLM execution (only used when `vllm_mode` is `"colocate"`)
+
+        vllm_gpu_memory_utilization (`float`, *optional*, defaults to `0.3`):
+            Control the GPU memory utilization for vLLM. This setting only applies when `vllm_mode` is set to
+            `"colocate"`. If you are using `vllm_mode="server"`, this parameter must be passed separately when
+            launching the vLLM server via the `--vllm_gpu_memory_utilization` flag.
+        vllm_tensor_parallel_size (`int`, *optional*, defaults to `1`):
+            Control the tensor parallel size for vLLM. This setting only applies when `vllm_mode` is set to
+            `"colocate"`. If you are using `vllm_mode="server"`, this parameter must be passed separately when
+            launching the vLLM server via the `--vllm_tensor_parallel_size` flag.
+        vllm_enable_sleep_mode (`bool`, *optional*, defaults to `False`):
+            Enable vLLM sleep mode to offload weights/cache during the optimizer step. Keeps GPU memory usage low, but
+            waking the engine adds host–device transfer latency.
+
+        > Parameters that control the training
+
+        beta (`float`, *optional*, defaults to `0.0`):
+            KL coefficient. If `0.0` (default), the reference model is not loaded, reducing memory usage and improving
+            training speed.
+        num_iterations (`int`, *optional*, defaults to `1`):
+            Number of iterations per batch (denoted as μ in the algorithm).
+        epsilon (`float`, *optional*, defaults to `0.2`):
+            Epsilon value for clipping.
+        delta (`float`, *optional*):
+            Enables the upper clipping bound in two-sided GRPO loss when set to a float. If `None` (default), standard
+            GRPO clipping is used. Recommended to be greater than `1 + ε` when enabled. This method is introduced in
+            the [INTELLECT-2 tech report](https://huggingface.co/papers/2505.07291).
+        epsilon_high (`float`, *optional*):
+            Upper-bound epsilon value for clipping. If not specified, it defaults to the same value as the lower-bound
+            specified in argument `epsilon`. Paper [DAPO](https://huggingface.co/papers/2503.14476) recommends `0.28`.
+            When used with `loss_type='cispo'`, this corresponds to the ε_max param specified in the [ScaleRL
+            paper](https://arxiv.org/pdf/2510.13786) and the recommended value is `5.0`.
+        importance_sampling_level (`str`, *optional*, defaults to `"token"`):
+            Controls whether importance sampling ratios are computed at the `"token"` or `"sequence"` level. `"token"`
+            keeps the raw per-token log-probability ratios (one weight per token). `"sequence"` averages the
+            log-probability ratios across valid tokens to produce a single ratio per sequence. The [GSPO
+            paper](https://huggingface.co/papers/2507.18071) shows that sequence-level sampling often yields more
+            stable training and better alignment with sequence-level rewards.
+        reward_weights (`list[float]`, *optional*):
+            Weights for each reward function. Must match the number of reward functions. If `None`, all rewards are
+            weighted equally with weight `1.0`.
+        scale_rewards (`str` or `bool`, *optional*, defaults to `"group"`):
+            Specifies the scaling strategy for rewards. Supported values are:
+
+            - `True` or `"group"` (default): rewards are scaled by the standard deviation within each group, ensuring
+              unit variance within a group.
+            - `"batch"`: rewards are scaled by the standard deviation across the entire batch, as recommended in the
+              [PPO Lite paper](https://huggingface.co/papers/2508.08221).
+            - `False` or `"none"`: no scaling is applied. The [Dr. GRPO
+              paper](https://huggingface.co/papers/2503.20783) recommends not scaling rewards, as scaling by the
+              standard deviation introduces a question-level difficulty bias.
+        loss_type (`str`, *optional*, defaults to `"dapo"`):
+            Specifies the loss formulation to use. Supported values are:
+
+            - `"grpo"`: Aggregates token-level losses by normalizing over sequence length. Not recommended due to
+              length bias—this approach tends to prefer shorter completions with positive advantages and longer ones
+              with negative advantages.
+            - `"dr_grpo"`: Aggregates token-level losses by normalizing with a global constant. This method was
+              introduced in the [Dr. GRPO paper](https://huggingface.co/papers/2503.20783) to eliminate length bias.
+              The value of the constant corresponds to `max_completion_length`.
+            - `"dapo"` (default): Aggregates token-level losses by normalizing with the number of active token in the
+              global accumulated batch. This method was introduced in the [DAPO
+              paper](https://huggingface.co/papers/2503.14476) to eliminate length bias.
+            - `"bnpo"`: Aggregates token-level losses by normalizing with the number of active token in the local
+              batch. Note that normalization is performed over the local batch only, so results may slightly vary
+              depending on the local batch size, despite a constant effective batch size. When using
+              `per_device_train_batch_size==1`, the loss is equivalent to the GRPO loss.
+            - `"cispo"`: Clips the importance sampling weights instead of the advantage scaled importance weights. The
+              clipped weights are then multiplied with the advantages and policy model's log probs. Individual token
+              losses are aggregated by normalizing with the number of active tokens in the global accumulated batch.
+              This method was introduced in the [MiniMax-M1 paper](https://huggingface.co/papers/2506.13585).
+        mask_truncated_completions (`bool`, *optional*, defaults to `False`):
+            When enabled, truncated completions are excluded from the loss calculation, preventing them from being
+            incorrectly penalized and introducing noise during training. According to the
+            [DAPO](https://huggingface.co/papers/2503.14476) paper, this is a good practice for training stability.
+        sync_ref_model (`bool`, *optional*, defaults to `False`):
+            Whether to synchronize the reference model with the active model every `ref_model_sync_steps` steps, using
+            the `ref_model_mixup_alpha` parameter. This synchronization originates from the
+            [TR-DPO](https://huggingface.co/papers/2404.09656) paper.
+        ref_model_mixup_alpha (`float`, *optional*, defaults to `0.6`):
+            α parameter from the [TR-DPO](https://huggingface.co/papers/2404.09656) paper, which controls the mix
+            between the current policy and the previous reference policy during updates. The reference policy is
+            updated according to the equation: `π_ref = α * π_θ + (1 - α) * π_ref_prev`. To use this parameter, you
+            must set `sync_ref_model=True`.
+        ref_model_sync_steps (`int`, *optional*, defaults to `512`):
+            τ parameter from the [TR-DPO](https://huggingface.co/papers/2404.09656) paper, which determines how
+            frequently the current policy is synchronized with the reference policy. To use this parameter, you must
+            set `sync_ref_model=True`.
+        top_entropy_quantile (`float`, *optional*, defaults to `1.0`):
+            ρ parameter from [Beyond the 80/20 Rule](https://huggingface.co/papers/2506.01939). Keeps in the policy
+            loss term only the top-ρ quantile of tokens by entropy of the probability distribution at each sequence
+            position, improving results. Range: `[0.0-1.0]`. A value of `0.0` masks all but the highest entropy token;
+            `1.0` keeps all tokens. The paper recommends a value of `0.2`. If used with
+            `mask_truncated_completions=True`, only tokens from non-truncated completions are considered.
+        use_liger_loss (`bool`, *optional*):
+            Whether to use Liger loss.
+
+            <Deprecated version="0.25.0">
+
+            Parameter `use_liger_loss` is deprecated and will be removed in version 0.28.0. Use `use_liger_kernel`
+            instead.
+
+            </Deprecated>
+
+        vllm_importance_sampling_correction (`bool`, *optional*, defaults to `True`):
+            Whether to apply Truncated Importance Sampling (TIS) between vLLM completion logprobs and recomputed
+            logprobs. [Your Efficient RL Framework Secretly Brings You Off-Policy RL
+            Training](https://fengyao.notion.site/off-policy-rl) highlights that using a separate generation framework
+            (such as vLLM) can introduce off-policy effects due to subtle implementation differences between generation
+            and training backends. TIS is proposed as a remedy for this issue.
+        vllm_importance_sampling_cap (`float`, *optional*, defaults to `2.0`):
+            Truncation parameter C for Truncated Importance Sampling (TIS). This sets an upper bound on the importance
+            sampling ratio, improving training stability.
+
+        > Parameters that control the logging
+
+        log_completions (`bool`, *optional*, defaults to `False`):
+            Whether to log a sample of (prompt, completion) pairs every `logging_steps` steps. If `rich` is installed,
+            it prints the sample. If `wandb` and/or `trackio` logging is enabled, it logs it to `wandb` and/or
+            `trackio`.
+        num_completions_to_print (`int`, *optional*):
+            Number of completions to print with `rich`. If `None`, all completions are logged.
+        wandb_log_unique_prompts (`bool`, *optional*, defaults to `False`):
+            Whether to log unique prompts in wandb. If `True`, only unique prompts are logged. If `False`, all prompts
+            are logged.
+    """
+
+    _VALID_DICT_FIELDS = TrainingArguments._VALID_DICT_FIELDS + ["model_init_kwargs"]
+
+    # Parameters whose default values are overridden from TrainingArguments
+    learning_rate: float = field(
+        default=1e-6,
+        metadata={"help": "The initial learning rate for AdamW."},
+    )
+    logging_steps: float = field(
+        default=10,
+        metadata={
+            "help": "Log every X updates steps. Should be an integer or a float in range `[0,1)`. If smaller than 1, "
+            "will be interpreted as ratio of total training steps."
+        },
+    )
+    gradient_checkpointing: bool = field(
+        default=True,
+        metadata={
+            "help": "If True, use gradient checkpointing to save memory at the expense of slower backward pass."
+        },
+    )
+    bf16: bool | None = field(
+        default=None,
+        metadata={
+            "help": "Whether to use bf16 (mixed) precision instead of 32-bit. Requires Ampere or higher NVIDIA "
+            "architecture or Intel XPU or using CPU (use_cpu) or Ascend NPU. If not set, it defaults to `True` if "
+            "`fp16` is not set."
+        },
+    )
+
+    # Parameters that control the model and reference model
+    model_init_kwargs: dict | str | None = field(
+        default=None,
+        metadata={
+            "help": "Keyword arguments for `transformers.AutoModelForCausalLM.from_pretrained`, used when the `model` "
+            "argument of the `GRPOTrainer` is provided as a string."
+        },
+    )
+    disable_dropout: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to disable dropout in the model. This is useful for training with a reference model, as "
+            "it prevents the model from generating different logprobs for the same input."
+        },
+    )
+    cast_lm_head_to_fp32: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to cast the language modeling head of the policy and reference, models to float32."
+            "As recommended by the [ScaleRL](https://huggingface.co/papers/2510.13786) recipe. This flag is only supported when the model"
+            " has untied word embedding and language modeling head layers i.e. `tie_word_embeddings` in the model config is False."
+        },
+    )
+
+    # Parameters that control the data preprocessing
+    # The default value remove_unused_columns is overwritten from the parent class, because in GRPO we usually rely on
+    # additional columns to compute the reward
+    remove_unused_columns: bool | None = field(
+        default=False,
+        metadata={
+            "help": "Whether to only keep the column 'prompt' in the dataset. If you use a custom reward function "
+            "that requires any column other than 'prompts' and 'completions', you should keep this to `False`."
+        },
+    )
+    max_prompt_length: int | None = field(
+        default=512,
+        metadata={
+            "help": "Maximum length of the prompt. If the prompt is longer than this value, it will be truncated left."
+        },
+    )
+    num_generations: int | None = field(
+        default=8,
+        metadata={
+            "help": "Number of generations to sample. The effective batch size (num_processes * per_device_batch_size "
+            "* gradient_accumulation_steps) must be evenly divisible by this value."
+        },
+    )
+    max_completion_length: int | None = field(
+        default=256,
+        metadata={"help": "Maximum length of the generated completion."},
+    )
+    ds3_gather_for_generation: bool = field(
+        default=True,
+        metadata={
+            "help": "This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for "
+            "generation, improving generation speed. However, disabling this option allows training models that "
+            "exceed the VRAM capacity of a single GPU, albeit at the cost of slower generation. Disabling this option "
+            "is not compatible with vLLM generation."
+        },
+    )
+    shuffle_dataset: bool | None = field(
+        default=True,
+        metadata={"help": "Whether to shuffle the training dataset."},
+    )
+
+    # Parameters that control generation
+    generation_batch_size: int | None = field(
+        default=None,
+        metadata={
+            "help": "Batch size to use for generation. If `None`, it defaults to the effective training batch size: "
+            "`per_device_train_batch_size * num_processes * steps_per_generation`."
+        },
+    )
+    steps_per_generation: int | None = field(
+        default=None,
+        metadata={
+            "help": "Number of steps per generation. If `None`, it defaults to `gradient_accumulation_steps`."
+        },
+    )
+    temperature: float = field(
+        default=1.0,
+        metadata={
+            "help": "Temperature for sampling. The higher the temperature, the more random the completions."
+        },
+    )
+    top_p: float = field(
+        default=1.0,
+        metadata={
+            "help": "Float that controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. "
+            "Set to 1.0 to consider all tokens."
+        },
+    )
+    top_k: int | None = field(
+        default=None,
+        metadata={
+            "help": "Number of highest probability vocabulary tokens to keep for top-k-filtering. If `None`, "
+            "top-k-filtering is disabled and all tokens are considered."
+        },
+    )
+    min_p: float | None = field(
+        default=None,
+        metadata={
+            "help": "Minimum token probability, which will be scaled by the probability of the most likely token. It "
+            "must be a value between 0.0 and 1.0. Typical values are in the 0.01-0.2 range."
+        },
+    )
+    generation_kwargs: dict | None = field(
+        default=None,
+        metadata={
+            "help": "Additional keyword arguments to pass to `GenerationConfig` (if using transformers) or "
+            "`SamplingParams` (if using vLLM) when sampling completions. This can be used to further customize the "
+            "generation behavior, such as setting `suppress_tokens`, `num_beams`, etc. If it contains keys that "
+            "conflict with the other generation parameters (like `min_p`, `top_p`, etc.), they will override them."
+        },
+    )
+    chat_template_kwargs: dict | None = field(
+        default=None,
+        metadata={
+            "help": "Additional keyword arguments to pass to the `apply_chat_template` function when generating "
+            "completions."
+        },
+    )
+    repetition_penalty: float = field(
+        default=1.0,
+        metadata={
+            "help": "Float that penalizes new tokens based on whether they appear in the prompt and the generated "
+            "text so far. Values > 1.0 encourage the model to use new tokens, while values < 1.0 encourage the model "
+            "to repeat tokens."
+        },
+    )
+    use_transformers_paged: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to use the `transformers` paged implementation for generation. If set to `True`, the "
+            "`transformers` paged implementation will be used for generation instead of the default padded "
+            "implementation. This parameter is only effective when `use_vllm` is set to `False`."
+        },
+    )
+    cache_implementation: str | None = field(
+        default=None,
+        metadata={
+            "help": "Implementation of the cache method for faster generation when use_vllm is set to False."
+        },
+    )
+
+    # Parameters that control generation acceleration powered by vLLM
+    use_vllm: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to use vLLM for generating completions. If set to `True`, the trainer will use vLLM for "
+            "generation instead of the default model.generate(). Requires `vllm` to be installed."
+        },
+    )
+    vllm_mode: str = field(
+        default="server",
+        metadata={
+            "help": "Mode to use for vLLM integration when `use_vllm` is set to `True`. Must be one of `'server'` or "
+            "`'colocate'`. `'server'`: The trainer will send generation requests to a separate vLLM server. Make sure "
+            "a TRL vLLM server is running (start with `trl vllm-serve`). `'colocate'`: vLLM will run in the same "
+            "process and share the training GPUs. This avoids the need for a separate server but may cause resource "
+            "contention with training."
+        },
+    )
+    vllm_model_impl: str = field(
+        default="vllm",
+        metadata={
+            "help": "Model implementation to use for vLLM. Must be one of `transformers` or `vllm`. `transformers`: "
+            "Use the `transformers` backend for model implementation. `vllm`: Use the `vllm` library for "
+            "model implementation."
+        },
+    )
+    vllm_enable_sleep_mode: bool = field(
+        default=False,
+        metadata={
+            "help": "Enable vLLM sleep mode to offload weights/cache during the optimizer step. Keeps GPU memory "
+            "usage low, but waking the engine adds host–device transfer latency."
+        },
+    )
+    vllm_guided_decoding_regex: str | None = field(
+        default=None,
+        metadata={
+            "help": "Regex for vLLM guided decoding. If `None` (default), guided decoding is disabled."
+        },
+    )
+
+    # Parameters that control the vLLM server (only used when `vllm_mode` is `"server"`)
+    vllm_server_base_url: str | None = field(
+        default=None,
+        metadata={
+            "help": "Base URL for the vLLM server (e.g., 'http://localhost:8000'). If provided, `vllm_server_host` "
+            "and `vllm_server_port` are ignored."
+        },
+    )
+    vllm_server_host: str = field(
+        default="0.0.0.0",
+        metadata={
+            "help": "Host of the vLLM server to connect to. Ignored if vllm_server_base_url is provided."
+        },
+    )
+    vllm_server_port: int = field(
+        default=8000,
+        metadata={
+            "help": "Port of the vLLM server to connect to. Ignored if vllm_server_base_url is provided."
+        },
+    )
+    vllm_server_timeout: float = field(
+        default=240.0,
+        metadata={
+            "help": "Total timeout duration in seconds to wait for the vLLM server to be up. If the server is not up "
+            "after the timeout, a `ConnectionError` is raised."
+        },
+    )
+
+    # Parameters that control colocated vLLM execution (only used when `vllm_mode` is `"colocate"`)
+    vllm_gpu_memory_utilization: float = field(
+        default=0.3,
+        metadata={
+            "help": "Control the GPU memory utilization for vLLM. This setting only applies when `vllm_mode` is set "
+            "to `'colocate'`. If you are using `vllm_mode='server'`, this parameter must be passed separately when "
+            "launching the vLLM server via the `--vllm_gpu_memory_utilization` flag."
+        },
+    )
+    vllm_tensor_parallel_size: int = field(
+        default=1,
+        metadata={
+            "help": "Control the tensor parallel size for vLLM. This setting only applies when `vllm_mode` is set "
+            "to `'colocate'`. If you are using `vllm_mode='server'`, this parameter must be passed separately when "
+            "launching the vLLM server via the `--vllm_tensor_parallel_size` flag."
+        },
+    )
+
+    # Parameters that control the training
+    beta: float = field(
+        default=0.0,
+        metadata={
+            "help": "KL coefficient. If `0.0` (default), the reference model is not loaded, reducing memory usage and "
+            "improving training speed."
+        },
+    )
+    num_iterations: int = field(
+        default=1,
+        metadata={
+            "help": "Number of iterations per batch (denoted as μ in the algorithm)."
+        },
+    )
+    epsilon: float = field(
+        default=0.2,
+        metadata={"help": "Epsilon value for clipping."},
+    )
+    delta: float | None = field(
+        default=None,
+        metadata={
+            "help": "Enables the upper clipping bound in two-sided GRPO loss when set to a float. If `None` "
+            "(default), standard GRPO clipping is used. Recommended to be greater than `1 + ε` when enabled. This "
+            "method is introduced in the [INTELLECT-2 tech report](https://huggingface.co/papers/2505.07291)."
+        },
+    )
+    epsilon_high: float | None = field(
+        default=None,
+        metadata={
+            "help": "Upper-bound epsilon value for clipping. If not specified, it defaults to the same value as the "
+            "lower-bound specified in argument `epsilon`. Paper DAPO recommends `0.28`. "
+            "When used with `loss_type='cispo'`, this corresponds to the ε_max param specified in the"
+            "[ScaleRL paper]https://huggingface.co/papers/2510.13786) and the recommended value is `5.0`."
+        },
+    )
+    importance_sampling_level: str = field(
+        default="token",
+        metadata={
+            "help": "Controls whether importance sampling ratios are computed at the `'token'` or `'sequence'` level. "
+            "`'token'` keeps the raw per-token log-probability ratios (one weight per token).  `'sequence'` averages "
+            "the log-probability ratios across valid tokens to produce a single ratio per sequence. The GSPO paper "
+            "shows that sequence-level sampling often yields more stable training and better alignment with "
+            "sequence-level rewards."
+        },
+    )
+    reward_weights: list[float] | None = field(
+        default=None,
+        metadata={
+            "help": "Weights for each reward function. Must match the number of reward functions. If `None`, all "
+            "rewards are weighted equally with weight `1.0`."
+        },
+    )
+    scale_rewards: str = field(
+        default="group",
+        metadata={
+            "help": "Specifies the scaling strategy for rewards. Supported values are: "
+            "`True` or `group'` (default): rewards are scaled by the standard deviation within each group, ensuring "
+            "unit variance within a group. "
+            "`'batch'`: rewards are scaled by the standard deviation across the entire batch, as recommended in the "
+            "PPO Lite paper. "
+            "`False` or `'none'`: no scaling is applied. The Dr. GRPO paper recommends not scaling rewards, as "
+            "scaling by the standard deviation introduces a question-level difficulty bias."
+        },
+    )
+    loss_type: str = field(
+        default="dapo",
+        metadata={
+            "help": "Specifies the loss formulation to use. Supported values are 'grpo', 'dapo', 'bnpo', and "
+            "'dr_grpo'. "
+            "'grpo': Aggregates token-level losses by normalizing over sequence length. Not recommended due to length "
+            "bias—this approach tends to prefer shorter completions with positive advantages and longer ones with "
+            "negative advantages. "
+            "'dapo' (default): Aggregates token-level losses by normalizing with the number of active token in the "
+            "global accumulated batch. This method was introduced in the DAPO paper to eliminate length bias. "
+            "'dr_grpo': Aggregates token-level losses by normalizing with a global constant. This method was "
+            "introduced in the Dr. GRPO paper to eliminate length bias. The value of the constant corresponds to "
+            "`max_completion_length`. "
+            "'bnpo': Aggregates token-level losses by normalizing with the number of active token in the local batch. "
+            "Note that normalization is performed over the local batch only, so results may slightly vary depending "
+            "on the local batch size, despite a constant effective batch size. When using "
+            "`per_device_train_batch_size==1`, the loss is equivalent to the GRPO loss."
+            "'cispo': Clips the importance sampling weights instead of the advantage scaled importance weights. "
+            "The clipped weights are then multiplied with the advantages and policy model's log probs. "
+            "Individual token losses are aggregated by normalizing with the number of active tokens in "
+            "the global accumulated batch. This method was introduced in the "
+            "[MiniMax-M1 paper](https://huggingface.co/papers/2506.13585)."
+        },
+    )
+    mask_truncated_completions: bool = field(
+        default=False,
+        metadata={
+            "help": "When enabled, truncated completions are excluded from the loss calculation, preventing them from "
+            "being incorrectly penalized and introducing noise during training. According to the DAPO paper, this is "
+            "a good practice for training stability."
+        },
+    )
+    sync_ref_model: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to synchronize the reference model with the active model every `ref_model_sync_steps` "
+            "steps, using the `ref_model_mixup_alpha` parameter."
+        },
+    )
+    ref_model_mixup_alpha: float = field(
+        default=0.6,
+        metadata={
+            "help": "α parameter from the TR-DPO paper, which controls the mix between the current policy and the "
+            "previous reference policy during updates. The reference policy is updated according to the equation: "
+            "`π_ref = α * π_θ + (1 - α) * π_ref_prev`. To use this parameter, you must set `sync_ref_model=True`."
+        },
+    )
+    ref_model_sync_steps: int = field(
+        default=512,
+        metadata={
+            "help": "τ parameter from the TR-DPO paper, which determines how frequently the current policy is "
+            "synchronized with the reference policy. To use this parameter, you must set `sync_ref_model=True`."
+        },
+    )
+    top_entropy_quantile: float = field(
+        default=1.0,
+        metadata={
+            "help": "ρ parameter from Beyond the 80/20 Rule. Keeps in the policy loss term only the top-ρ quantile of "
+            "tokens by entropy of the probability distribution at each sequence position, improving results. Range: "
+            "[0.0-1.0]. A value of `0.0` masks all but the highest entropy token; `1.0` keeps all tokens. The paper "
+            "recommends a value of `0.2`. If used with `mask_truncated_completions=True`, only tokens from "
+            "non-truncated completions are considered."
+        },
+    )
+    use_liger_loss: bool = field(
+        default=None,
+        metadata={"help": "Whether to use the Liger GRPO loss."},
+    )
+    vllm_importance_sampling_correction: bool = field(
+        default=True,
+        metadata={
+            "help": "Whether to apply Truncated Importance Sampling (TIS) between vLLM completion logprobs and "
+            "recomputed logprobs. Your Efficient RL Framework Secretly Brings You Off-Policy RL "
+            "Training highlights that using a separate generation framework (such as vLLM) can introduce off-policy "
+            "effects due to subtle implementation differences between generation and training backends. TIS is "
+            "proposed as a remedy for this issue."
+        },
+    )
+    vllm_importance_sampling_cap: float = field(
+        default=2.0,
+        metadata={
+            "help": "Truncation parameter C for Truncated Importance Sampling (TIS). This sets an upper bound on the "
+            "importance sampling ratio, improving training stability."
+        },
+    )
+
+    # Parameters that control the logging
+    log_completions: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to log a sample of (prompt, completion) pairs every `logging_steps` steps. If `rich` is "
+            "installed, it prints the sample. If `wandb` logging is enabled, it logs it to `wandb`."
+        },
+    )
+    num_completions_to_print: int | None = field(
+        default=None,
+        metadata={
+            "help": "Number of completions to print with `rich`. If `None`, all completions are logged."
+        },
+    )
+    wandb_log_unique_prompts: bool | None = field(
+        default=False,
+        metadata={
+            "help": "Whether to log unique prompts in wandb. If `True`, only unique prompts are logged. If `False`, "
+            "all prompts are logged."
+        },
+    )
+
+    def __post_init__(self):
+        self.bf16 = not (self.fp16) if self.bf16 is None else self.bf16
+
+        super().__post_init__()
+
+        self.scale_rewards = {True: "group", False: "none"}.get(
+            self.scale_rewards, self.scale_rewards
+        )
+
+        num_processes = self.world_size
+        # The current default effective batch size
+        if self.generation_batch_size is None and self.steps_per_generation is None:
+            self.steps_per_generation = self.gradient_accumulation_steps
+            self.generation_batch_size = (
+                self.per_device_train_batch_size
+                * num_processes
+                * self.steps_per_generation
+            )
+        elif (
+            self.generation_batch_size is not None and self.steps_per_generation is None
+        ):
+            # Just ensure the value is divisible by the global batch size
+            if (
+                self.generation_batch_size
+                % (self.per_device_train_batch_size * num_processes)
+                != 0
+            ):
+                raise ValueError(
+                    f"generation_batch_size ({self.generation_batch_size}) must be divisible by the global batch size "
+                    f"({self.per_device_train_batch_size * num_processes})."
+                )
+            self.steps_per_generation = self.generation_batch_size // (
+                self.per_device_train_batch_size * num_processes
+            )
+        elif (
+            self.generation_batch_size is None and self.steps_per_generation is not None
+        ):
+            self.generation_batch_size = (
+                self.per_device_train_batch_size
+                * num_processes
+                * self.steps_per_generation
+            )
+        else:
+            raise ValueError(
+                "'generation_batch_size' and 'steps_per_generation' can not be both configured at the same time"
+            )
+
+        if self.do_eval and self.eval_strategy != "no":
+            # Just ensure the value is divisible by the global batch size
+            if (
+                self.per_device_eval_batch_size * num_processes
+            ) % self.num_generations != 0:
+                raise ValueError(
+                    f"The global eval batch size ({self.per_device_eval_batch_size} * {num_processes}) must be "
+                    f"divisible by num_generations ({self.num_generations})."
+                )
+
+        # The generation batch must contain full prompt groups (no partials), so it must be divisible by
+        # num_generations.
+        if self.generation_batch_size % self.num_generations != 0:
+            raise ValueError(
+                f"generation_batch_size ({self.generation_batch_size}) must be divisible by num_generations "
+                f"({self.num_generations})."
+            )
+
+        if self.num_generations < 2:
+            raise ValueError(
+                "GRPO requires at least 2 generations per prompt to calculate the advantages. You provided "
+                f"{self.num_generations}, which is less than the minimum required."
+            )
+
+        if self.use_liger_loss is not None:
+            warnings.warn(
+                "The `use_liger_loss` argument is deprecated and will be removed in version 0.28.0. Please use "
+                "`use_liger_kernel` instead.",
+                FutureWarning,
+                stacklevel=2,
+            )
+            self.use_liger_kernel = self.use_liger_loss
+
+        if self.delta is not None and self.use_liger_kernel:
+            raise ValueError("Liger kernel does not support two-sided GRPO loss yet.")
diff --git a/src/aixpert/training/training/trl/trainer/grpo_trainer.py b/src/aixpert/training/training/trl/trainer/grpo_trainer.py
new file mode 100644
index 0000000..35d19a3
--- /dev/null
+++ b/src/aixpert/training/training/trl/trainer/grpo_trainer.py
@@ -0,0 +1,2420 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import os
+import textwrap
+import warnings
+from collections import defaultdict, deque
+from collections.abc import Callable
+from contextlib import nullcontext
+from functools import partial
+from pathlib import Path
+from typing import Any
+
+import datasets
+import pandas as pd
+import torch
+import torch.utils.data
+import transformers
+from accelerate import logging
+from accelerate.utils import (
+    broadcast_object_list,
+    gather,
+    gather_object,
+    is_peft_model,
+    set_seed,
+)
+from datasets import Dataset, IterableDataset
+from torch import nn
+from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+from torch.utils.data import DataLoader, Sampler
+from transformers import (
+    AutoConfig,
+    AutoModelForSequenceClassification,
+    AutoProcessor,
+    AutoTokenizer,
+    GenerationConfig,
+    PreTrainedModel,
+    PreTrainedTokenizerBase,
+    ProcessorMixin,
+    TrainerCallback,
+    is_trackio_available,
+    is_wandb_available,
+)
+from transformers.trainer_utils import seed_worker
+from transformers.utils import (
+    is_datasets_available,
+    is_peft_available,
+    is_rich_available,
+)
+
+from ..data_utils import (
+    apply_chat_template,
+    is_conversational,
+    prepare_multimodal_messages,
+    prepare_multimodal_messages_vllm,
+)
+from ..extras.profiling import profiling_context, profiling_decorator
+from ..extras.vllm_client import VLLMClient
+from ..import_utils import is_liger_kernel_available, is_vllm_available
+from ..models import (
+    prepare_deepspeed,
+    prepare_fsdp,
+    prepare_peft_model,
+    unwrap_model_for_generation,
+)
+from ..models.utils import _ForwardRedirection
+from .base_trainer import BaseTrainer
+from .callbacks import SyncRefModelCallback
+from .grpo_config import GRPOConfig
+from .utils import (
+    RepeatSampler,
+    disable_dropout_in_model,
+    ensure_master_addr_port,
+    entropy_from_logits,
+    get_config_model_id,
+    identity,
+    nanmax,
+    nanmin,
+    nanstd,
+    pad,
+    print_prompt_completions_sample,
+    selective_log_softmax,
+    shuffle_sequence_dict,
+    split_pixel_values_by_grid,
+    split_tensor_dict,
+    unsplit_pixel_values_by_grid,
+)
+
+
+if is_peft_available():
+    from peft import PeftConfig, PeftModel
+
+if is_liger_kernel_available():
+    from liger_kernel.chunked_loss import LigerFusedLinearGRPOLoss
+
+if is_vllm_available():
+    from vllm import LLM, SamplingParams
+    from vllm.sampling_params import GuidedDecodingParams
+
+if is_wandb_available():
+    import wandb
+
+if is_trackio_available():
+    import trackio
+
+
+logger = logging.get_logger(__name__)
+
+# What we call a reward function is a callable that takes a list of prompts and completions and returns a list of
+# rewards. When it's a string, it's a model ID, so it's loaded as a pretrained model.
+RewardFunc = str | PreTrainedModel | Callable[[list, list], list[float]]
+
+# What we call a rollout function is a callable that takes prompts (list), args (GRPOConfig), and processing_class as
+# parameters and returns a dict of generation results. Those results must include "prompt_ids", "completion_ids", and
+# "logprobs" fields. Any extra fields (per-completion) are forwarded to the reward functions.
+RolloutFunc = Callable[[list[str], Any, Any], dict[str, Any]]
+
+
+class GRPOTrainer(BaseTrainer):
+    """
+    Trainer for the Group Relative Policy Optimization (GRPO) method. This algorithm was initially proposed in the
+    paper [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language
+    Models](https://huggingface.co/papers/2402.03300).
+
+    Example:
+
+    ```python
+    from datasets import load_dataset
+    from trl import GRPOTrainer
+
+    dataset = load_dataset("trl-lib/tldr", split="train")
+
+
+    def reward_func(completions, **kwargs):
+        # Dummy reward function that rewards completions with more unique letters.
+        return [float(len(set(completion))) for completion in completions]
+
+
+    trainer = GRPOTrainer(
+        model="Qwen/Qwen2-0.5B-Instruct",
+        reward_funcs=reward_func,
+        train_dataset=dataset,
+    )
+
+    trainer.train()
+    ```
+
+    Args:
+        model (`str | PreTrainedModel`):
+            Model to be trained. Can be either:
+
+            - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or a
+              path to a *directory* containing model weights saved using
+              [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded
+              using [`~transformers.AutoModelForCausalLM.from_pretrained`] with the keyword arguments in
+              `args.model_init_kwargs`.
+            - A [`~transformers.PreTrainedModel`] object. Only causal language models are supported.
+        reward_funcs (`RewardFunc | list[RewardFunc]`):
+            Reward functions to be used for computing the rewards. To compute the rewards, we call all the reward
+            functions with the prompts and completions and sum the rewards. Can be either:
+
+            - A single reward function, such as:
+                - A string: The *model ID* of a pretrained model hosted inside a model repo on huggingface.co, or a
+                path to a *directory* containing model weights saved using
+                [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded
+                using [`~transformers.AutoModelForSequenceClassification.from_pretrained`] with `num_labels=1` and the
+                keyword arguments in `args.model_init_kwargs`.
+                - A [`~transformers.PreTrainedModel`] object: Only sequence classification models are supported.
+                - A custom reward function: The function is provided with the prompts and the generated completions,
+                  plus any additional columns in the dataset. It should return a list of rewards. Custom reward
+                  functions can also return `None` when the reward is not applicable to those samples. This is useful
+                  for multi-task training where different reward functions apply to different types of samples. When a
+                  reward function returns `None` for a sample, that reward function is excluded from the reward
+                  calculation for that sample. For more details, see [Using a custom reward
+                  function](#using-a-custom-reward-function).
+
+                  The trainer's state is also passed to the reward function. The trainer's state is an instance of
+                  [`~transformers.TrainerState`] and can be accessed by accessing the `trainer_state` argument to the
+                  reward function's signature.
+            - A list of reward functions, where each item can independently be any of the above types. Mixing different
+            types within the list (e.g., a string model ID and a custom reward function) is allowed.
+        args ([`GRPOConfig`], *optional*):
+            Configuration for this trainer. If `None`, a default configuration is used.
+        train_dataset ([`~datasets.Dataset`] or [`~datasets.IterableDataset`]):
+            Dataset to use for training. It must include a column `"prompt"`. Any additional columns in the dataset is
+            ignored. The format of the samples can be either:
+
+            - [Standard](dataset_formats#standard): Each sample contains plain text.
+            - [Conversational](dataset_formats#conversational): Each sample contains structured messages (e.g., role
+              and content).
+        eval_dataset ([`~datasets.Dataset`], [`~datasets.IterableDataset`] or `dict[str, Dataset | IterableDataset]`):
+            Dataset to use for evaluation. It must meet the same requirements as `train_dataset`.
+        processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.ProcessorMixin`], *optional*):
+            Processing class used to process the data. The padding side must be set to "left". If `None`, the
+            processing class is loaded from the model's name with [`~transformers.AutoProcessor.from_pretrained`]. A
+            padding token, `tokenizer.pad_token`, must be set. If the processing class has not set a padding token,
+            `tokenizer.eos_token` will be used as the default.
+        reward_processing_classes ([`~transformers.PreTrainedTokenizerBase`] or `list[PreTrainedTokenizerBase]`, *optional*):
+            Processing classes corresponding to the reward functions specified in `reward_funcs`. Can be either:
+
+            - A single processing class: Used when `reward_funcs` contains only one reward function.
+            - A list of processing classes: Must match the order and length of the reward functions in `reward_funcs`.
+            If set to `None`, or if an element of the list corresponding to a [`~transformers.PreTrainedModel`] is
+            `None`, the tokenizer for the model is automatically loaded using
+            [`~transformers.AutoTokenizer.from_pretrained`]. For elements in `reward_funcs` that are custom reward
+            functions (not [`~transformers.PreTrainedModel`]), the corresponding entries in `reward_processing_classes`
+            are ignored.
+        callbacks (list of [`~transformers.TrainerCallback`], *optional*):
+            List of callbacks to customize the training loop. Will add those to the list of default callbacks detailed
+            in [here](https://huggingface.co/docs/transformers/main_classes/callback).
+
+            If you want to remove one of the default callbacks used, use the [`~transformers.Trainer.remove_callback`]
+            method.
+        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*, defaults to `(None, None)`):
+            A tuple containing the optimizer and the scheduler to use. Will default to an instance of [`AdamW`] on your
+            model and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`.
+        peft_config ([`~peft.PeftConfig`], *optional*):
+            PEFT configuration used to wrap the model. If `None`, the model is not wrapped.
+        rollout_func (`RolloutFunc`, *optional*):
+            Function to use for generating completions. It must take prompts, args, and processing_class as parameters
+            and return a dict with `"prompt_ids"`, `"completion_ids"`, and `"logprobs"` fields. Any other fields that
+            are forwarded to the reward functions. This feature is experimental and may change or be removed at any
+            time without prior notice.
+    """
+
+    _tag_names = ["trl", "grpo"]
+    _name = "GRPO"
+    _paper = {
+        "title": "DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models",
+        "id": "2402.03300",
+        # docstyle-ignore
+        "citation": textwrap.dedent("""\
+            @article{shao2024deepseekmath,
+                title        = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
+                author       = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
+                year         = 2024,
+                eprint       = {arXiv:2402.03300},
+            }
+            """),
+    }
+
+    def __init__(
+        self,
+        model: str | PreTrainedModel,
+        reward_funcs: RewardFunc | list[RewardFunc],
+        args: GRPOConfig | None = None,
+        train_dataset: Dataset | IterableDataset | None = None,
+        eval_dataset: Dataset
+        | IterableDataset
+        | dict[str, Dataset | IterableDataset]
+        | None = None,
+        processing_class: PreTrainedTokenizerBase | ProcessorMixin | None = None,
+        reward_processing_classes: PreTrainedTokenizerBase
+        | list[PreTrainedTokenizerBase]
+        | None = None,
+        callbacks: list[TrainerCallback] | None = None,
+        optimizers: tuple[
+            torch.optim.Optimizer | None, torch.optim.lr_scheduler.LambdaLR | None
+        ] = (None, None),
+        peft_config: "PeftConfig | None" = None,
+        rollout_func: RolloutFunc | None = None,
+    ):
+        # Args
+        if args is None:
+            model_name = (
+                model if isinstance(model, str) else get_config_model_id(model.config)
+            )
+            model_name = model_name.split("/")[-1]
+            args = GRPOConfig(f"{model_name}-GRPO")
+
+        # Models
+        # Trained model
+        model_init_kwargs = args.model_init_kwargs or {}
+        if isinstance(model, str):
+            model_id = model
+            dtype = model_init_kwargs.get("dtype")
+            if isinstance(dtype, torch.dtype) or dtype == "auto" or dtype is None:
+                pass  # dtype is already a torch.dtype or "auto" or None
+            elif isinstance(dtype, str):  # it's a str, but not "auto"
+                dtype = getattr(torch, dtype)
+                model_init_kwargs["dtype"] = dtype
+            else:
+                raise ValueError(
+                    "Invalid `dtype` passed to `GRPOConfig`. Expected either 'auto' or a string representing "
+                    f"a `torch.dtype` (e.g., 'float32'), but got {dtype}."
+                )
+            # Disable caching if gradient checkpointing is enabled (not supported)
+            config = AutoConfig.from_pretrained(model_id)
+            architecture = getattr(transformers, config.architectures[0])
+            model = architecture.from_pretrained(model_id, **model_init_kwargs)
+        else:
+            model_id = get_config_model_id(model.config)
+            if args.model_init_kwargs is not None:
+                logger.warning(
+                    "You passed `model_init_kwargs` to the `GRPOConfig`, but your model is already instantiated. "
+                    "The `model_init_kwargs` will be ignored."
+                )
+
+        # Some models (SmolVLM/Idefics3) don't support `logits_to_keep` argument and error out if we pass it
+        # Inspect the forward method before we wrap the model with PEFT
+        self.model_kwarg_keys = (
+            inspect.signature(model.forward).parameters.keys()
+            if not hasattr(model, "get_base_model")
+            else inspect.signature(model.get_base_model().forward).parameters.keys()
+        )
+
+        if peft_config is not None or (
+            is_peft_available() and isinstance(model, PeftModel)
+        ):
+            model = prepare_peft_model(model, peft_config, args)
+
+        # Processing class
+        if processing_class is None:
+            processing_class = AutoProcessor.from_pretrained(
+                get_config_model_id(model.config), truncation_side="left"
+            )
+
+        # Handle pad token for processors or tokenizers
+        if isinstance(processing_class, ProcessorMixin):
+            tokenizer = processing_class.tokenizer
+        elif isinstance(processing_class, PreTrainedTokenizerBase):
+            tokenizer = processing_class
+        else:
+            raise TypeError(
+                "The `processing_class` must be either a `PreTrainedTokenizerBase` or a `ProcessorMixin`"
+            )
+
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+
+        self.pad_token = tokenizer.pad_token
+        self.pad_token_id = tokenizer.pad_token_id
+        self.eos_token_id = tokenizer.eos_token_id
+
+        # Reward functions
+        if not isinstance(reward_funcs, list):
+            reward_funcs = [reward_funcs]
+        self.reward_func_names = []
+        for i, reward_func in enumerate(reward_funcs):
+            if isinstance(reward_func, str):
+                reward_funcs[i] = AutoModelForSequenceClassification.from_pretrained(
+                    reward_func, num_labels=1, **model_init_kwargs
+                )
+            if isinstance(
+                reward_funcs[i], nn.Module
+            ):  # Use Module over PretrainedModel for compat w/ compiled models
+                self.reward_func_names.append(
+                    get_config_model_id(reward_funcs[i].config).split("/")[-1]
+                )
+            else:
+                self.reward_func_names.append(reward_funcs[i].__name__)
+        self.reward_funcs = reward_funcs
+
+        # Reward weights
+        if args.reward_weights is not None:
+            if len(args.reward_weights) != len(reward_funcs):
+                raise ValueError(
+                    f"Number of reward weights ({len(args.reward_weights)}) must match number of reward "
+                    f"functions ({len(reward_funcs)})"
+                )
+            self.reward_weights = torch.tensor(args.reward_weights, dtype=torch.float32)
+        else:
+            self.reward_weights = torch.ones(len(reward_funcs), dtype=torch.float32)
+
+        # Reward processing class
+        if reward_processing_classes is None:
+            reward_processing_classes = [None] * len(reward_funcs)
+        elif not isinstance(reward_processing_classes, list):
+            reward_processing_classes = [reward_processing_classes]
+        if len(reward_processing_classes) != len(reward_funcs):
+            raise ValueError(
+                f"The number of reward processing classes ({len(reward_processing_classes)}) must match the number of "
+                f"reward functions ({len(reward_funcs)})."
+            )
+
+        for i, (reward_processing_class, reward_func) in enumerate(
+            zip(reward_processing_classes, reward_funcs, strict=True)
+        ):
+            if isinstance(reward_func, PreTrainedModel):
+                if reward_processing_class is None:
+                    reward_processing_class = AutoTokenizer.from_pretrained(
+                        get_config_model_id(reward_func.config)
+                    )
+                if reward_processing_class.pad_token_id is None:
+                    reward_processing_class.pad_token = (
+                        reward_processing_class.eos_token
+                    )
+                # The reward model computes the reward for the latest non-padded token in the input sequence.
+                # So it's important to set the pad token ID to the padding token ID of the processing class.
+                reward_func.config.pad_token_id = reward_processing_class.pad_token_id
+                reward_processing_classes[i] = reward_processing_class
+
+        self.reward_processing_classes = reward_processing_classes
+
+        # Rollout function
+        if (
+            rollout_func is not None
+            and os.environ.get("TRL_EXPERIMENTAL_SILENCE", "0") != "1"
+        ):
+            warnings.warn(
+                "You are importing from 'rollout_func', which is an experimental feature. This API may change or be "
+                "removed at any time without prior notice. Silence this warning by setting environment variable "
+                "TRL_EXPERIMENTAL_SILENCE=1.",
+                UserWarning,
+                stacklevel=2,
+            )
+        self.rollout_func = rollout_func
+
+        # Training arguments
+        self.max_prompt_length = args.max_prompt_length
+        self.max_completion_length = (
+            args.max_completion_length
+        )  # = |o_i| in the GRPO paper
+        self.num_generations = args.num_generations  # = G in the GRPO paper
+        self.chat_template_kwargs = args.chat_template_kwargs or {}
+        self.temperature = args.temperature
+        self.top_p = args.top_p
+        self.top_k = args.top_k
+        self.min_p = args.min_p
+        self.repetition_penalty = args.repetition_penalty
+        self.use_transformers_paged = args.use_transformers_paged
+        self.use_vllm = args.use_vllm
+        self.vllm_mode = args.vllm_mode
+        self.vllm_gpu_memory_utilization = (
+            args.vllm_gpu_memory_utilization
+        )  # only applies to colocation mode
+        self.vllm_tensor_parallel_size = (
+            args.vllm_tensor_parallel_size
+        )  # only applies to colocation mode
+        self.vllm_importance_sampling_correction = (
+            args.vllm_importance_sampling_correction
+        )
+        self.vllm_importance_sampling_cap = args.vllm_importance_sampling_cap
+        self.use_liger_kernel = args.use_liger_kernel
+        self.loss_type = args.loss_type
+        self.scale_rewards = args.scale_rewards
+        self.importance_sampling_level = args.importance_sampling_level
+        self.mask_truncated_completions = args.mask_truncated_completions
+        self.top_entropy_quantile = args.top_entropy_quantile
+        if self.use_liger_kernel and self.top_entropy_quantile < 1.0:
+            raise NotImplementedError(
+                "Liger Kernels don't currently support masking token positions based on entropy."
+            )
+        if self.use_liger_kernel and not self.importance_sampling_level == "token":
+            raise NotImplementedError(
+                "Liger Kernels currently only support token-level importance sampling. Please set"
+                "`importance_sampling_level` to 'token'."
+            )
+
+        # Datasets
+        self.shuffle_dataset = args.shuffle_dataset
+
+        if (
+            isinstance(train_dataset, IterableDataset)
+            or isinstance(eval_dataset, IterableDataset)
+            or (
+                isinstance(eval_dataset, dict)
+                and any(isinstance(ds, IterableDataset) for ds in eval_dataset.values())
+            )
+        ):
+            # See https://github.com/huggingface/trl/issues/3213
+            raise NotImplementedError(
+                "Iterable datasets are not yet supported in GRPOTrainer. Please use a standard dataset instead."
+            )
+
+        # Multi-step
+        self.num_iterations = args.num_iterations  # = 𝜇 in the GRPO paper
+        self.epsilon_low = args.epsilon
+        self.epsilon_high = (
+            args.epsilon_high if args.epsilon_high is not None else args.epsilon
+        )
+        # Tracks the number of iterations (forward + backward passes), including those within a grad accum cycle
+        self._step = 0
+        # Buffer the batch to reuse generated outputs across multiple updates. For more details, see
+        # `_get_train_sampler` and `_prepare_inputs`.
+        self._buffered_inputs = None
+
+        # The trainer estimates the number of FLOPs (floating-point operations) using the number of elements in the
+        # input tensor associated with the key "input_ids". However, in GRPO, the sampled data does not include the
+        # "input_ids" key. Instead, the available keys is "prompt". As a result, the trainer issues the warning:
+        # "Could not estimate the number of tokens of the input, floating-point operations will not be computed." To
+        # suppress this warning, we set the "estimate_tokens" key in the model's "warnings_issued" dictionary to True.
+        # This acts as a flag to indicate that the warning has already been issued.
+        model.warnings_issued["estimate_tokens"] = True
+
+        super().__init__(
+            model=model,
+            args=args,
+            data_collator=identity,  # No data collation is needed in GRPO
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            processing_class=processing_class,
+            callbacks=callbacks,
+            optimizers=optimizers,
+            # In Trainer, `training_step` scales the loss by `gradient_accumulation_steps` only if `compute_loss_func`
+            # is None. For DAPO, loss scaling instead depends on the total number of completions tokens across the
+            # global accumulated batch. To control scaling ourselves, we must disable Trainer’s built-in scaling. The
+            # simplest (though a bit hacky) way is to set `compute_loss_func` to any non-None value, which bypasses
+            # that behavior without rewriting `training_step`.
+            compute_loss_func="non-None value to disable scaling",
+        )
+
+        # Reference model
+        self.beta = args.beta
+        if self.beta == 0.0:
+            # If beta is 0.0, the reference model is not needed
+            self.ref_model = None
+        elif is_peft_model(model):
+            # If PEFT is used, the reference model is not needed since the adapter can be disabled
+            # to revert to the initial model.
+            self.ref_model = None
+        else:
+            # For deepspeed, fsdp or non-distributed models, create a reference model from scratch
+            config = AutoConfig.from_pretrained(model_id)
+            architecture = getattr(transformers, config.architectures[0])
+            self.ref_model = architecture.from_pretrained(model_id, **model_init_kwargs)
+
+        # Disable dropout in the models
+        if args.disable_dropout:
+            disable_dropout_in_model(model)
+            if self.ref_model is not None:
+                disable_dropout_in_model(self.ref_model)
+
+        # Cast LM Head To FP32
+        if args.cast_lm_head_to_fp32:
+
+            def _cast_lm_head_to_fp32(target_model: PreTrainedModel):
+                """Cast lm_head to fp32 while preserving embedding output dtype if tied."""
+
+                def cast_inputs_to_fp32(module, inputs):
+                    # Preserve other positional args and kwargs untouched
+                    if not inputs:
+                        return inputs
+                    return (inputs[0].to(torch.float32),) + inputs[1:]
+
+                original_dtype_local = target_model.lm_head.weight.dtype
+                target_model.lm_head = target_model.lm_head.float()
+                target_model.lm_head.register_forward_pre_hook(cast_inputs_to_fp32)
+
+                if target_model.config.tie_word_embeddings:
+
+                    def cast_outputs_to_original_dtype(module, args, output):
+                        return output.to(original_dtype_local)
+
+                    # Only cast activations; weights are now fp32 (intentional for numerical stability of logits)
+                    target_model.model.embed_tokens.register_forward_hook(
+                        cast_outputs_to_original_dtype
+                    )
+
+            _cast_lm_head_to_fp32(model)
+            if self.ref_model is not None:
+                _cast_lm_head_to_fp32(self.ref_model)
+
+        # Liger loss
+        if self.use_liger_kernel:
+            if not is_liger_kernel_available():
+                raise ImportError(
+                    "Liger is required to use `use_liger_kernel` as the GRPO loss. Run `pip install liger-kernel`."
+                )
+            # redirect the model.module forward to the model forward to ensure pre-forward hooks are called
+            self._forward_redirection = _ForwardRedirection()
+
+            self.liger_grpo_loss = LigerFusedLinearGRPOLoss(
+                beta=self.beta,
+                epsilon_low=self.epsilon_low,
+                epsilon_high=self.epsilon_high,
+                temperature=self.temperature,
+                use_ref_model=self.beta != 0.0,
+                loss_type=self.loss_type,
+                max_completion_length=self.max_completion_length,
+            )
+
+        # Initialize the metrics
+        self._metrics = {"train": defaultdict(list), "eval": defaultdict(list)}
+        self._total_train_tokens = 0
+        self.log_completions = args.log_completions
+        self.wandb_log_unique_prompts = args.wandb_log_unique_prompts
+        self.num_completions_to_print = args.num_completions_to_print
+        # Keep logs sized to the generation batch to record only outputs from the latest model update.
+        self._logs = {
+            "images": deque(maxlen=args.generation_batch_size),
+            "prompt": deque(maxlen=args.generation_batch_size),
+            "completion": deque(maxlen=args.generation_batch_size),
+            "rewards": defaultdict(lambda: deque(maxlen=args.generation_batch_size)),
+            "advantages": deque(maxlen=args.generation_batch_size),
+        }
+
+        # Ensure each process receives a unique seed to prevent duplicate completions when generating with
+        # transformers if num_generations exceeds per_device_train_batch_size. We could skip it if we use vLLM, but
+        # it's safer to set it in all cases.
+        set_seed(args.seed, device_specific=True)
+
+        if self.use_vllm:
+            if not is_vllm_available():
+                raise ImportError(
+                    "vLLM is not available and `use_vllm` is set to True. Please install vLLM with "
+                    "`pip install trl[vllm]` to use it."
+                )
+
+            if self.vllm_mode == "server":
+                if self.accelerator.is_main_process:
+                    if args.vllm_server_base_url is not None:
+                        base_url = args.vllm_server_base_url
+                    else:
+                        base_url = (
+                            f"http://{args.vllm_server_host}:{args.vllm_server_port}"
+                        )
+                    self.vllm_client = VLLMClient(
+                        base_url=base_url, connection_timeout=args.vllm_server_timeout
+                    )
+                    self.vllm_client.init_communicator(
+                        device=torch.cuda.current_device()
+                    )
+
+            elif self.vllm_mode == "colocate":
+                # Make sure vllm_tensor_parallel_size group size evenly divides the world size - each group should have
+                # the same number of ranks
+                if (
+                    not self.accelerator.num_processes % self.vllm_tensor_parallel_size
+                    == 0
+                ):
+                    raise ValueError(
+                        f"vllm_tensor_parallel_size ({self.vllm_tensor_parallel_size}) must divide world size "
+                        f"({self.accelerator.num_processes}) evenly."
+                    )
+
+                if self.vllm_tensor_parallel_size > 1:
+                    # Create subgroups of ranks for TP, each group with `vllm_tensor_parallel_size` ranks.
+                    # For example, if world_size=8 and vllm_tensor_parallel_size=2 → groups: [0,1], [2,3], [4,5], [6,7]
+                    self.tp_group, _ = torch.distributed.new_subgroups_by_enumeration(
+                        [
+                            list(
+                                range(
+                                    i * self.vllm_tensor_parallel_size,
+                                    (i + 1) * self.vllm_tensor_parallel_size,
+                                )
+                            )
+                            for i in range(
+                                self.accelerator.num_processes
+                                // self.vllm_tensor_parallel_size
+                            )
+                        ]
+                    )
+
+                # vLLM requires the environment variables to be set for distributed training.
+                os.environ["RANK"] = str(self.accelerator.process_index)
+                os.environ["LOCAL_RANK"] = str(self.accelerator.local_process_index)
+                os.environ["WORLD_SIZE"] = str(self.accelerator.num_processes)
+                # Ensure distributed rendezvous variables are set without colliding across concurrent runs
+                ensure_master_addr_port()
+
+                if (
+                    self.max_prompt_length is not None
+                    and self.max_completion_length is not None
+                ):
+                    max_model_len = self.max_prompt_length + self.max_completion_length
+                else:
+                    max_model_len = None
+                self.llm = LLM(
+                    model=model.name_or_path,
+                    tensor_parallel_size=args.vllm_tensor_parallel_size,
+                    gpu_memory_utilization=self.vllm_gpu_memory_utilization,
+                    max_num_seqs=self.args.per_device_train_batch_size
+                    * self.vllm_tensor_parallel_size
+                    * self.args.steps_per_generation,
+                    max_model_len=max_model_len,
+                    distributed_executor_backend="external_launcher",
+                    # Feed identical seed for tp groups to ensure sampling results are the same across workers
+                    seed=self.accelerator.process_index
+                    // self.vllm_tensor_parallel_size,
+                    # Latest vLLM v1 memory profiler is misled by the high default value (i.e., 32768) - thinking there's not enough memory
+                    max_num_batched_tokens=4096,
+                    model_impl=self.args.vllm_model_impl,
+                    enable_sleep_mode=self.args.vllm_enable_sleep_mode,
+                    # Important so temperature scaling/logit tweaking affects the TIS log probs
+                    logprobs_mode="processed_logprobs",
+                )
+                if self.args.vllm_enable_sleep_mode:
+                    self.llm.sleep(level=2)
+            else:
+                raise ValueError(
+                    f"vllm_mode must be either 'server' or 'colocate', got '{self.vllm_mode}'."
+                )
+
+            # vLLM specific sampling arguments
+            self.guided_decoding_regex = args.vllm_guided_decoding_regex
+
+            self._last_loaded_step = (
+                -1
+            )  # tag to avoid useless loading during grad accumulation
+
+            # When using vLLM, the main process is responsible for loading the model weights. This can cause process
+            # desynchronization and seems to lead to DeepSpeed hanging during initialization. To prevent this, we
+            # synchronize all processes after vLLM has been fully initialized.
+            self.accelerator.wait_for_everyone()
+        else:
+            generation_kwargs = {
+                "max_new_tokens": self.max_completion_length,
+                "do_sample": True,
+                "pad_token_id": tokenizer.pad_token_id,
+                "bos_token_id": tokenizer.bos_token_id,
+                "eos_token_id": tokenizer.eos_token_id,
+                "temperature": self.temperature,
+                "top_p": self.top_p,
+                "top_k": self.top_k,
+                "min_p": self.min_p,
+                "repetition_penalty": self.repetition_penalty,
+                "cache_implementation": args.cache_implementation,
+            }
+            if args.generation_kwargs is not None:
+                generation_kwargs.update(args.generation_kwargs)
+            self.generation_config = GenerationConfig(**generation_kwargs)
+
+        # Gradient accumulation requires scaled loss. Normally, loss scaling in the parent class depends on whether the
+        # model accepts loss-related kwargs. Since we compute our own loss, this check is irrelevant. We set
+        # self.model_accepts_loss_kwargs to False to enable scaling.
+        self.model_accepts_loss_kwargs = False
+
+        # Add tags to the model
+        self.model.add_model_tags(self._tag_names)
+
+        if self.ref_model is not None:
+            if self.is_deepspeed_enabled:
+                self.ref_model = prepare_deepspeed(self.ref_model, self.accelerator)
+            elif self.is_fsdp_enabled:
+                self.ref_model = prepare_fsdp(self.ref_model, self.accelerator)
+            else:
+                self.ref_model = self.accelerator.prepare_model(
+                    self.ref_model, evaluation_mode=True
+                )
+
+        if args.sync_ref_model:
+            self.add_callback(
+                SyncRefModelCallback(
+                    ref_model=self.ref_model, accelerator=self.accelerator
+                )
+            )
+
+        for i, reward_func in enumerate(self.reward_funcs):
+            if isinstance(reward_func, PreTrainedModel):
+                if self.is_deepspeed_enabled:
+                    self.reward_funcs[i] = prepare_deepspeed(
+                        reward_func, self.accelerator
+                    )
+                else:
+                    # set device placement to True to make `prepare_model` move `reward_func` to device when using fsdp
+                    self.reward_funcs[i] = self.accelerator.prepare_model(
+                        reward_func, evaluation_mode=True, device_placement=True
+                    )
+
+    def _set_signature_columns_if_needed(self):
+        # If `self.args.remove_unused_columns` is True, non-signature columns are removed.
+        # By default, this method sets `self._signature_columns` to the model's expected inputs.
+        # In GRPOTrainer, we preprocess data, so using the model's signature columns doesn't work.
+        # Instead, we set them to the columns expected by the `training_step` method, hence the override.
+        if self._signature_columns is None:
+            self._signature_columns = ["prompt", "image", "images"]
+
+    # This method overrides `Trainer.get_train_dataloader` to support our custom batching strategy.
+    # Instead of returning a standard per-step batch (i.e., `per_device_batch_size), our dataloader loads an
+    # *generation* batch (i.e., `per_device_batch_size × steps_per_generation`). This allows us to generate completions
+    # once every steps_per_generation step—rather than once per accumulation step—which is significantly more
+    # efficient. The only change from the original implementation is multiplying the batch size by
+    # `steps_per_generation`. Thus, `_prepare_inputs` is called with this *generation* batch, and it handles the
+    # splitting internally.
+    # Maintenance note: This method is a copy-paste of the original `Trainer.get_train_dataloader` with only one line
+    # modification. As a result, some parts of the method aren't relevant to GRPO, but we keep them to stay one line
+    # apart from the super method, ensuring easier maintenance in the future.
+    def get_train_dataloader(self):
+        if self.train_dataset is None:
+            raise ValueError("Trainer: training requires a train_dataset.")
+
+        train_dataset = self.train_dataset
+        data_collator = self.data_collator
+        if is_datasets_available() and isinstance(train_dataset, datasets.Dataset):
+            train_dataset = self._remove_unused_columns(
+                train_dataset, description="training"
+            )
+        else:
+            data_collator = self._get_collator_with_removed_columns(
+                data_collator, description="training"
+            )
+
+        dataloader_params = {
+            "batch_size": self._train_batch_size
+            * self.args.steps_per_generation,  # < this is the change
+            "collate_fn": data_collator,
+            "num_workers": self.args.dataloader_num_workers,
+            "pin_memory": self.args.dataloader_pin_memory,
+            "persistent_workers": self.args.dataloader_persistent_workers,
+        }
+
+        if not isinstance(train_dataset, torch.utils.data.IterableDataset):
+            dataloader_params["sampler"] = self._get_train_sampler()
+            dataloader_params["drop_last"] = self.args.dataloader_drop_last
+            dataloader_params["worker_init_fn"] = partial(
+                seed_worker,
+                num_workers=self.args.dataloader_num_workers,
+                rank=self.args.process_index,
+            )
+
+            dataloader_params["prefetch_factor"] = self.args.dataloader_prefetch_factor
+
+        return self.accelerator.prepare(DataLoader(train_dataset, **dataloader_params))
+
+    def _get_train_sampler(self, dataset: Dataset | None = None) -> Sampler:
+        # Returns a sampler that
+        # 1. ensures each prompt is repeated across multiple processes. This guarantees that identical prompts are
+        #    distributed to different GPUs, allowing rewards to be computed and normalized correctly within each prompt
+        #    group. Using the same seed across processes ensures consistent prompt assignment, preventing discrepancies
+        #    in group formation.
+        # 2. repeats the batch multiple times to allow reusing generations across multiple updates. Refer to
+        #    _prepare_inputs to see how the generations are stored and reused.
+
+        # In the following figure, the values are the prompt indices. The first row shows the first sampled batch, the
+        # second row shows the second sampled batch, and so on.
+        #
+        #                                      |   GPU 0  |   GPU 1  |
+        #
+        #                 global_step   step    <-───>  num_generations=2
+        #                                       <-───────> per_device_train_batch_size=3
+        #  grad_accum    ▲  ▲  0          0     0   0   1   1   2   2   <- Generate for the first `steps_per_generation` (prompts 0 to 11); store the completions; use the first slice to compute the loss
+        #     =2         ▼  |  0          1     3   3   4   4   5   5   <- Take the stored generations and use the second slice to compute the loss
+        #                   |
+        #                   |  1          2     6   6   7   7   8   8   <- Take the stored generations and use the third slice to compute the loss
+        #  steps_per_gen=4  ▼  1          3     9   9  10  10  11  11   <- Take the stored generations and use the fourth slice to compute the loss
+        #
+        #                      2          4    12  12  13  13  14  14   <- Generate for the second `steps_per_generation` (prompts 12 to 23); store the completions; use the first slice to compute the loss
+        #                      2          5    15  15  16  16  17  17   <- Take the stored generations and use the second slice to compute the loss
+        #                                          ...
+        if dataset is None:
+            dataset = self.train_dataset
+        return RepeatSampler(
+            data_source=dataset,
+            mini_repeat_count=self.num_generations,
+            batch_size=self.args.generation_batch_size // self.num_generations,
+            repeat_count=self.num_iterations * self.args.steps_per_generation,
+            shuffle=self.shuffle_dataset,
+            seed=self.args.seed,
+        )
+
+    def _get_eval_sampler(self, eval_dataset) -> Sampler:
+        # See _get_train_sampler for an explanation of the sampler.
+        return RepeatSampler(
+            data_source=eval_dataset,
+            mini_repeat_count=self.num_generations,
+            seed=self.args.seed,
+        )
+
+    @profiling_decorator
+    def _get_last_hidden_state(
+        self,
+        unwrapped_model,
+        input_ids,
+        attention_mask,
+        logits_to_keep,
+        pixel_values=None,
+        image_grid_thw=None,
+        pixel_attention_mask=None,
+        image_sizes=None,
+    ):
+        if is_peft_model(unwrapped_model):
+            unwrapped_model = unwrapped_model.base_model.model
+
+        # Build model inputs - check if the model supports logits_to_keep (some models and VLMs don't)
+        model_inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
+
+        # For Qwen models:
+        if image_grid_thw is not None and pixel_values is not None:
+            model_inputs["image_grid_thw"] = image_grid_thw
+        # For Gemma, SmolVLM2, LLaVa-Next etc.:
+        if pixel_values is not None:
+            model_inputs["pixel_values"] = pixel_values
+        # For SmolVLM2
+        if pixel_attention_mask is not None:
+            model_inputs["pixel_attention_mask"] = pixel_attention_mask
+        # For LLaVa-Next
+        if image_sizes is not None:
+            model_inputs["image_sizes"] = image_sizes
+
+        # Only add logits_to_keep if the model supports it
+        if "logits_to_keep" in self.model_kwarg_keys:
+            # We add 1 to `logits_to_keep` because the last logits of the sequence is later excluded
+            model_inputs["logits_to_keep"] = logits_to_keep + 1
+
+        model_inputs["use_cache"] = (
+            False  # only used in generation; set False to suppress warnings
+        )
+
+        last_hidden_state = unwrapped_model.model(**model_inputs).last_hidden_state
+        # Exclude the last value: it corresponds to the next token pred
+        last_hidden_state = last_hidden_state[:, :-1, :]  # (B, L-1, H)
+        # Only keep the last logits_to_keep. For model that support logits_to_keep, this is a no-op.
+        last_hidden_state = last_hidden_state[
+            :, -logits_to_keep:, :
+        ]  # (B, logits_to_keep, H)
+        return last_hidden_state
+
+    def get_high_entropy_mask(
+        self, entropies: torch.Tensor, mask: torch.Tensor, threshold: float
+    ) -> torch.Tensor:
+        """
+        Returns a binary mask identifying tokens whose entropy exceeds a given quantile threshold.
+
+        Args:
+            entropies (`torch.Tensor`):
+                Tensor of shape (batch_size, seq_len) with per-token entropy values.
+            mask (`torch.Tensor`):
+                Binary mask of the same shape as `entropies`, where `1` indicates valid tokens and `0` padding.
+            threshold (`float`):
+                Quantile threshold between `0.0` and `1.0` to select high-entropy tokens.
+
+        Returns
+        -------
+            `torch.Tensor`:
+                Boolean mask of shape (batch_size, seq_len), where `True` indicates tokens with entropy >= threshold
+                and `False` otherwise.
+        """
+        local = entropies[mask.bool()].float()
+
+        # Use a negative pad_value as a sentinel because entropy values are always >= 0.
+        # This guarantees that the sentinel cannot collide with any real entropy value.
+        pad_value = -1e9
+
+        # Pad across processes so that every rank has the same tensor length
+        padded = self.accelerator.pad_across_processes(
+            local, dim=0, pad_index=pad_value
+        )
+        gathered = self.accelerator.gather(padded)
+
+        # Drop sentinel values (safe because no entropy can be negative)
+        gathered = gathered[gathered != pad_value]
+
+        if gathered.numel() == 0:
+            return torch.zeros_like(entropies, dtype=torch.bool)
+
+        entropy_threshold = torch.quantile(gathered, threshold)
+        masked_entropies = entropies * mask.float()
+        entropy_mask = masked_entropies >= entropy_threshold
+        return entropy_mask & mask.bool()  # ensure padding tokens are always masked out
+
+    @profiling_decorator
+    def _get_per_token_logps_and_entropies(
+        self,
+        model,
+        input_ids,
+        attention_mask,
+        logits_to_keep,
+        batch_size=None,
+        compute_entropy=False,
+        pixel_values=None,
+        image_grid_thw=None,
+        num_images=None,
+        pixel_attention_mask=None,
+        image_sizes=None,
+        token_type_ids=None,
+    ) -> dict[str, torch.Tensor | None]:
+        """Compute log-probs and (optionally) entropies for each token."""
+        batch_size = batch_size or input_ids.size(
+            0
+        )  # Chunk inputs into smaller batches to reduce memory peak
+        all_logps = []
+        all_entropies = []
+        for start in range(0, input_ids.size(0), batch_size):
+            input_ids_batch = input_ids[start : start + batch_size]
+            attention_mask_batch = attention_mask[start : start + batch_size]
+
+            # Build model inputs - check if the model supports logits_to_keep (some models and VLMs don't)
+            model_inputs = {
+                "input_ids": input_ids_batch,
+                "attention_mask": attention_mask_batch,
+            }
+            if image_grid_thw is not None and pixel_values is not None:
+                rows_per_image = image_grid_thw.prod(dim=-1)
+                rows_per_sample = torch.split(rows_per_image, num_images)
+                rows_per_sample = torch.stack([s.sum() for s in rows_per_sample])
+                cum_rows = torch.cat(
+                    [
+                        torch.tensor([0], device=rows_per_sample.device),
+                        rows_per_sample.cumsum(0),
+                    ]
+                )
+                row_start, row_end = (
+                    cum_rows[start].item(),
+                    cum_rows[start + batch_size].item(),
+                )
+                model_inputs["pixel_values"] = pixel_values[row_start:row_end]
+                cum_imgs = torch.tensor([0] + num_images).cumsum(0)
+                img_start, img_end = cum_imgs[start], cum_imgs[start + batch_size]
+                model_inputs["image_grid_thw"] = image_grid_thw[img_start:img_end]
+            elif pixel_values is not None:
+                model_inputs["pixel_values"] = pixel_values[start : start + batch_size]
+            if pixel_attention_mask is not None:
+                model_inputs["pixel_attention_mask"] = pixel_attention_mask[
+                    start : start + batch_size
+                ]
+            if image_sizes is not None:
+                model_inputs["image_sizes"] = image_sizes[start : start + batch_size]
+            if token_type_ids is not None:
+                model_inputs["token_type_ids"] = token_type_ids[
+                    start : start + batch_size
+                ]
+
+            # Only add logits_to_keep if the model supports it
+            if "logits_to_keep" in self.model_kwarg_keys:
+                # We add 1 to `logits_to_keep` because the last logits of the sequence is later excluded
+                model_inputs["logits_to_keep"] = logits_to_keep + 1
+
+            model_inputs["use_cache"] = (
+                False  # only used in generation; set False to suppress warnings
+            )
+
+            logits = model(**model_inputs).logits
+            # Exclude the last value: it corresponds to the next token pred
+            logits = logits[:, :-1, :]  # (B, L-1, H)
+            # Only keep the last logits_to_keep. For model that support logits_to_keep, this is a no-op.
+            logits = logits[:, -logits_to_keep:, :]  # (B, logits_to_keep, H)
+            # Divide logits by sampling temperature.
+            # See https://huggingface.co/blog/the_n_implementation_details_of_rlhf_with_ppo#policy-training-implementation-details
+            logits = logits / self.temperature
+            completion_ids = input_ids_batch[:, -logits_to_keep:]
+            logps = selective_log_softmax(logits, completion_ids)  # compute logprobs
+            all_logps.append(logps)
+
+            if compute_entropy:
+                with torch.no_grad():
+                    entropies = entropy_from_logits(logits)
+                all_entropies.append(entropies)
+
+        logps = torch.cat(all_logps, dim=0)
+        entropies = torch.cat(all_entropies, dim=0) if compute_entropy else None
+        return logps, entropies
+
+    def _fix_param_name_to_vllm(self, name, extra_prefixes: list[str] | None = None):
+        extra_prefixes = extra_prefixes or []
+        prefixes = ["_checkpoint_wrapped_module."] + extra_prefixes
+        for prefix in prefixes:
+            name = name.replace(prefix, "")
+        return name
+
+    def _sync_fsdp1_params_to_vllm(
+        self, module: nn.Module, prefix: str = "", visited=None
+    ):
+        """Memory-efficient post-order traversal of FSDP modules to extract full parameters and sync with vLLM."""
+        # For FSDP1, we need to recurse into children and also use summon_full_params
+        if visited is None:
+            visited = set()
+        for child_name, child_module in module.named_children():
+            child_prefix = f"{prefix}.{child_name}" if prefix else child_name
+            self._sync_fsdp1_params_to_vllm(
+                child_module, prefix=child_prefix, visited=visited
+            )  # recurse into the child
+
+        if isinstance(module, FSDP):
+            with FSDP.summon_full_params(module, recurse=False, writeback=False):
+                for param_name, param in module.named_parameters():
+                    full_name = f"{prefix}.{param_name}" if prefix else param_name
+                    full_name = self._fix_param_name_to_vllm(
+                        full_name, extra_prefixes=["_fsdp_wrapped_module."]
+                    )
+
+                    if full_name in visited:
+                        continue  # skip FSDP subtrees already traversed
+                    visited.add(full_name)
+
+                    if self.vllm_mode == "server" and self.accelerator.is_main_process:
+                        self.vllm_client.update_named_param(full_name, param.data)
+                    elif self.vllm_mode == "colocate":
+                        llm_model = self.llm.llm_engine.model_executor.driver_worker.model_runner.model
+                        llm_model.load_weights([(full_name, param.data)])
+
+    def _sync_fsdp2_params_to_vllm(self, module: nn.Module):
+        # For FSDP2, module.state_dict() already covers all parameters, so no need for recursion
+        for name, param in module.state_dict().items():
+            if param.is_cpu:
+                param = param.to(torch.device("cuda"))
+            param = param.full_tensor()
+
+            if self.vllm_mode == "server" and self.accelerator.is_main_process:
+                self.vllm_client.update_named_param(name, param)
+            elif self.vllm_mode == "colocate":
+                llm_model = (
+                    self.llm.llm_engine.model_executor.driver_worker.model_runner.model
+                )
+                llm_model.load_weights([(name, param)])
+
+    @profiling_decorator
+    def _move_model_to_vllm(self):
+        # For DeepSpeed ZeRO-3 and FSDP, we need to gather all parameters before operations
+        deepspeed_plugin = self.accelerator.state.deepspeed_plugin
+        zero_stage_3 = deepspeed_plugin is not None and deepspeed_plugin.zero_stage == 3
+        if zero_stage_3:
+            import deepspeed
+
+            gather_if_zero3 = deepspeed.zero.GatheredParameters
+        else:
+            gather_if_zero3 = nullcontext
+
+        if is_peft_model(self.model):
+            # With PEFT and FSDP/DeepSpeed ZeRO Stage 3, we must gather the full model at once before merging, as
+            # merging adapters in a sharded manner is not supported.
+            # TODO: does this work with FSDP?
+            with gather_if_zero3(list(self.model.parameters())):
+                self.model.merge_adapter()
+
+                # Update vLLM weights while parameters are gathered
+                if (
+                    self.is_fsdp_enabled
+                ):  # note if using FSDP, gather_if_zero3 is nullcontext
+                    # Update vLLM weights while parameters are gathered
+                    # For PEFT with FSDP we need to use the memory efficient post-order traversal
+                    fsdp_plugin = getattr(self.accelerator.state, "fsdp_plugin", None)
+                    fsdp_version = (
+                        getattr(fsdp_plugin, "fsdp_version", 1) if fsdp_plugin else 1
+                    )
+                    if fsdp_version == 1:
+                        self._sync_fsdp1_params_to_vllm(
+                            self.model
+                        )  # use memory-efficient post-order traversal for FSDP
+                    elif fsdp_version == 2:
+                        self._sync_fsdp2_params_to_vllm(self.model)
+                else:
+                    # DeepSpeed ZeRO-3 with PEFT
+                    for name, param in self.model.named_parameters():
+                        # When using PEFT, we need to recover the original parameter name and discard some parameters
+                        name = name.removeprefix("base_model.model.").replace(
+                            ".base_layer", ""
+                        )
+                        if self.model.prefix in name:
+                            continue
+                        # When module to save, remove its prefix and discard the original module
+                        if "original_module" in name:
+                            continue
+                        name = self._fix_param_name_to_vllm(
+                            name, extra_prefixes=["modules_to_save.default."]
+                        )
+
+                        if (
+                            self.vllm_mode == "server"
+                            and self.accelerator.is_main_process
+                        ):
+                            self.vllm_client.update_named_param(name, param.data)
+                        elif self.vllm_mode == "colocate":
+                            llm_model = self.llm.llm_engine.model_executor.driver_worker.model_runner.model
+                            llm_model.load_weights([(name, param.data)])
+                # Unmerge adapters while parameters are still gathered
+                self.model.unmerge_adapter()
+                # Parameters will automatically be repartitioned when exiting the context
+        # For non-PEFT models, simply gather (if needed) and update each parameter individually.
+        elif self.is_fsdp_enabled:
+            fsdp_plugin = getattr(self.accelerator.state, "fsdp_plugin", None)
+            fsdp_version = getattr(fsdp_plugin, "fsdp_version", 1) if fsdp_plugin else 1
+            if fsdp_version == 1:
+                self._sync_fsdp1_params_to_vllm(
+                    self.model
+                )  # use memory-efficient post-order traversal for FSDP
+            elif fsdp_version == 2:
+                self._sync_fsdp2_params_to_vllm(self.model)
+        else:
+            for name, param in self.model.named_parameters():
+                name = self._fix_param_name_to_vllm(name)
+                with gather_if_zero3([param]):
+                    if self.vllm_mode == "server" and self.accelerator.is_main_process:
+                        self.vllm_client.update_named_param(name, param.data)
+                    elif self.vllm_mode == "colocate":
+                        llm_model = self.llm.llm_engine.model_executor.driver_worker.model_runner.model
+                        llm_model.load_weights([(name, param.data)])
+
+        # Reset cache on vLLM
+        if self.vllm_mode == "server" and self.accelerator.is_main_process:
+            self.vllm_client.reset_prefix_cache()
+        elif self.vllm_mode == "colocate":
+            self.llm.reset_prefix_cache()
+
+    @profiling_decorator
+    def _prepare_inputs(
+        self, generation_batch: dict[str, torch.Tensor | Any]
+    ) -> dict[str, torch.Tensor | Any]:
+        # Prepares inputs for model training/evaluation by managing completion generation and batch handling.
+        # During training:
+        #   - Receives the local generation batch (Per-GPU batch size × steps per generation)
+        #     from the modified training dataloader instead of the standard local batch
+        #   - Generates completions once for the entire generation batch and splits it into batches of size
+        #     `per_device_train_batch_size`
+        #   - Buffers these completions and returns the appropriate slice for the current accumulation step
+        #   - Optimizes by regenerating completions only periodically (every steps_per_generation * num_iterations)
+        # During evaluation:
+        #   - The input is treated as a standard local batch (no accumulation, no multiple iterations)
+        #   - Completions are generated for each batch without buffering or reuse
+        # Returns a single local batch in both cases.
+
+        mode = "train" if self.model.training else "eval"
+        if mode == "train":
+            generate_every = self.args.steps_per_generation * self.num_iterations
+            if self._step % generate_every == 0 or self._buffered_inputs is None:
+                # self._buffered_inputs=None can occur when resuming from a checkpoint
+                generation_batch = self._generate_and_score_completions(
+                    generation_batch
+                )
+                generation_batch = split_pixel_values_by_grid(generation_batch)
+                generation_batch = shuffle_sequence_dict(generation_batch)
+                generation_batches = split_tensor_dict(
+                    generation_batch, self.args.steps_per_generation
+                )
+                self._buffered_inputs = [
+                    unsplit_pixel_values_by_grid(batch) for batch in generation_batches
+                ]
+            inputs = self._buffered_inputs[self._step % self.args.steps_per_generation]
+            self._step += 1
+        else:
+            # In evaluation, there is neither batch grouping for generation, nor multiple iterations, hence
+            # local generation batch == local eval batch
+            inputs = self._generate_and_score_completions(generation_batch)
+        return inputs
+
+    @profiling_decorator
+    def _calculate_rewards(self, inputs, prompts, completions, completion_ids_list):
+        device = self.accelerator.device
+        rewards_per_func = torch.zeros(
+            len(prompts), len(self.reward_funcs), device=device
+        )
+
+        # Repeat all input columns (but "prompt", "completion", and "completion_ids") to match the num of generations
+        keys = [
+            key
+            for key in inputs[0]
+            if key not in ["prompt", "completion", "completion_ids"]
+        ]
+        reward_kwargs = {key: [example[key] for example in inputs] for key in keys}
+
+        # This allows for dynamic reward shaping based on training progress.
+        reward_kwargs["trainer_state"] = self.state
+
+        for i, (reward_func, reward_processing_class, reward_func_name) in enumerate(
+            zip(
+                self.reward_funcs,
+                self.reward_processing_classes,
+                self.reward_func_names,
+                strict=True,
+            )
+        ):
+            with profiling_context(self, reward_func_name):
+                if isinstance(
+                    reward_func, nn.Module
+                ):  # Module (no PretrainedModel) for compat with compiled models
+                    if is_conversational(inputs[0]):
+                        messages = [
+                            {"messages": p + c}
+                            for p, c in zip(prompts, completions, strict=True)
+                        ]
+                        texts = [
+                            apply_chat_template(
+                                x, reward_processing_class, **self.chat_template_kwargs
+                            )["text"]
+                            for x in messages
+                        ]
+                    else:
+                        texts = [
+                            p + c for p, c in zip(prompts, completions, strict=True)
+                        ]
+                    reward_inputs = reward_processing_class(
+                        text=texts,
+                        return_tensors="pt",
+                        padding=True,
+                        padding_side="right",
+                        add_special_tokens=False,
+                    )
+                    reward_inputs = super()._prepare_inputs(reward_inputs)
+                    with torch.inference_mode():
+                        rewards_per_func[:, i] = reward_func(**reward_inputs).logits[
+                            :, 0
+                        ]  # Shape (B*G,)
+                else:
+                    output_reward_func = reward_func(
+                        prompts=prompts,
+                        completions=completions,
+                        completion_ids=completion_ids_list,
+                        **reward_kwargs,
+                    )
+                    # Convert None values to NaN
+                    output_reward_func = [
+                        reward if reward is not None else torch.nan
+                        for reward in output_reward_func
+                    ]
+
+                    rewards_per_func[:, i] = torch.tensor(
+                        output_reward_func, dtype=torch.float32, device=device
+                    )
+
+        # If all reward functions return None for a given row, issue a detailed warning
+        if torch.isnan(rewards_per_func).all(dim=1).any():
+            nan_row_idx = (
+                torch.isnan(rewards_per_func).all(dim=1).nonzero(as_tuple=True)[0][0]
+            )
+            row_reward_kwargs = {
+                key: value[nan_row_idx]
+                for key, value in reward_kwargs.items()
+                if key != "trainer_state"
+            }
+            row_reward_kwargs["prompt"] = prompts[nan_row_idx]
+            row_reward_kwargs["completion"] = completions[nan_row_idx]
+            logger.warning(
+                f"All reward functions returned None for the following kwargs:\n{row_reward_kwargs}\n"
+                "Please ensure that at least one reward function returns a valid reward."
+            )
+
+        # Gather the reward per function: this part is crucial, because the rewards are normalized per group and the
+        # completions may be distributed across processes
+        rewards_per_func = gather(rewards_per_func)
+        return rewards_per_func
+
+    def _generate_single_turn(self, prompts: list):
+        device = self.accelerator.device
+
+        # Generate completions using either vLLM or regular generation
+        if self.use_vllm:
+            if self.vllm_mode == "colocate" and self.args.vllm_enable_sleep_mode:
+                # wake up colocated vLLM instances if needed
+                torch.cuda.empty_cache()  # required to avoid OOM in some cases
+                self.llm.wake_up(tags=["weights"])
+
+            # First, update the vLLM weights if needed
+            if self.state.global_step != self._last_loaded_step:
+                self._move_model_to_vllm()
+                self._last_loaded_step = self.state.global_step
+
+            if is_conversational({"prompt": prompts[0]}):
+                prompts = [
+                    prepare_multimodal_messages_vllm(prompt) for prompt in prompts
+                ]
+
+            # Generate completions using vLLM: gather all prompts and use them in a single call in the main process
+            if self.vllm_mode == "server":
+                all_prompts = gather_object(prompts)
+
+                if self.accelerator.is_main_process:
+                    # Since 'prompts' contains 'num_generations' duplicates, we first take unique prompts, and generate
+                    # num_generations outputs for each one. This is faster than generating outputs for each duplicate
+                    # prompt individually.
+                    ordered_set_of_prompts = all_prompts[:: self.num_generations]
+
+                    sampling_params = {
+                        "n": self.num_generations,
+                        "repetition_penalty": self.repetition_penalty,
+                        "temperature": self.temperature,
+                        "top_p": self.top_p,
+                        "top_k": -1 if self.top_k is None else self.top_k,
+                        "min_p": 0.0 if self.min_p is None else self.min_p,
+                        "max_tokens": self.max_completion_length,
+                        "truncate_prompt_tokens": self.max_prompt_length,
+                        "guided_decoding_regex": self.guided_decoding_regex,
+                        "generation_kwargs": self.args.generation_kwargs,
+                    }
+                    with profiling_context(self, "vLLM.generate"):
+                        if self.rollout_func is not None:
+                            if is_conversational({"prompt": ordered_set_of_prompts[0]}):
+                                ordered_set_of_prompts = [
+                                    apply_chat_template(
+                                        {"prompt": p},
+                                        self.processing_class,
+                                        **self.chat_template_kwargs,
+                                    )["prompt"]
+                                    for p in ordered_set_of_prompts
+                                ]
+                            output = self.rollout_func(
+                                ordered_set_of_prompts,
+                                self.args,
+                                self.processing_class,
+                            )
+                        elif is_conversational({"prompt": ordered_set_of_prompts[0]}):
+                            output = self.vllm_client.chat(
+                                messages=ordered_set_of_prompts,
+                                **sampling_params,
+                                chat_template_kwargs=self.chat_template_kwargs,
+                            )
+                        else:
+                            output = self.vllm_client.generate(
+                                prompts=ordered_set_of_prompts, **sampling_params
+                            )
+                        # Extract required fields and collect any extra fields for reward functions
+                        required_keys = {"prompt_ids", "completion_ids", "logprobs"}
+                        extra_fields = {
+                            k: v for k, v in output.items() if k not in required_keys
+                        }
+                        payload = (
+                            output["prompt_ids"],
+                            output["completion_ids"],
+                            output["logprobs"],
+                            extra_fields,
+                        )
+                else:
+                    payload = None
+
+                # Broadcast the completions from the main process to all processes, ensuring each process receives its corresponding slice.
+                obj_list = [payload]
+                broadcast_object_list(obj_list, from_process=0)
+                all_prompt_ids, all_completion_ids, all_logprobs, all_extra_fields = (
+                    obj_list[0]
+                )
+
+                # At this point, we only get 1 copy of each prompt, so we need to repeat them num_generations times
+                all_prompt_ids = [
+                    ids for ids in all_prompt_ids for _ in range(self.num_generations)
+                ]
+
+                process_slice = slice(
+                    self.accelerator.process_index * len(prompts),
+                    (self.accelerator.process_index + 1) * len(prompts),
+                )
+                prompt_ids = all_prompt_ids[process_slice]
+                completion_ids = all_completion_ids[process_slice]
+                logprobs = all_logprobs[process_slice]
+
+                # Slice extra fields dict-of-lists per process (extra fields are per-completion, like completion_ids)
+                extra_fields = {}
+                for key, values in all_extra_fields.items():
+                    if isinstance(values, list):
+                        extra_fields[key] = values[process_slice]
+                    else:
+                        extra_fields[key] = values
+
+            # Generate completions using colocated vLLM instances: each device holds vLLM copy and work on their own batch of prompts
+            elif self.vllm_mode == "colocate":
+                if self.guided_decoding_regex:
+                    guided_decoding = GuidedDecodingParams(
+                        regex=self.guided_decoding_regex
+                    )
+                else:
+                    guided_decoding = None
+
+                generation_kwargs = {
+                    "n": 1,  # vLLM on each GPU generates only 1 in colocate mode
+                    "repetition_penalty": self.repetition_penalty,
+                    "temperature": self.temperature,
+                    "top_p": self.top_p,
+                    "top_k": -1 if self.top_k is None else self.top_k,
+                    "min_p": 0.0 if self.min_p is None else self.min_p,
+                    "max_tokens": self.max_completion_length,
+                    "truncate_prompt_tokens": self.max_prompt_length,
+                    "guided_decoding": guided_decoding,
+                    "logprobs": 0,  # enable returning log probabilities; 0 means for the sampled tokens only
+                }
+                if self.args.generation_kwargs is not None:
+                    generation_kwargs.update(self.args.generation_kwargs)
+                sampling_params = SamplingParams(**generation_kwargs)
+
+                if self.vllm_tensor_parallel_size > 1:
+                    # Gather prompts from all ranks in the TP group and flatten.
+                    # Each rank starts with its own prompts; after gathering, all ranks see the full group set.
+                    orig_size = len(prompts)
+                    gathered_prompts = [
+                        None for _ in range(self.vllm_tensor_parallel_size)
+                    ]
+                    torch.distributed.all_gather_object(
+                        gathered_prompts, prompts, group=self.tp_group
+                    )
+                    all_prompts = [p for sublist in gathered_prompts for p in sublist]
+                else:
+                    all_prompts = prompts
+
+                if self.args.vllm_enable_sleep_mode:
+                    self.llm.wake_up(tags=["kv_cache"])
+
+                with profiling_context(self, "vLLM.generate"):
+                    if is_conversational({"prompt": prompts[0]}):
+                        all_outputs = self.llm.chat(
+                            all_prompts, sampling_params=sampling_params, use_tqdm=False
+                        )
+                    else:
+                        all_outputs = self.llm.generate(
+                            all_prompts, sampling_params=sampling_params, use_tqdm=False
+                        )
+
+                all_prompt_ids = [output.prompt_token_ids for output in all_outputs]
+                all_completion_ids = [
+                    output.token_ids
+                    for outputs in all_outputs
+                    for output in outputs.outputs
+                ]
+                all_logprobs = [
+                    [next(iter(lp.values())).logprob for lp in output.logprobs]
+                    for outputs in all_outputs
+                    for output in outputs.outputs
+                ]
+
+                if self.vllm_tensor_parallel_size > 1:
+                    # Slice completions for this rank within its TP group.
+                    # Each rank generates all outputs — we keep only our share.
+                    local_rank_in_group = torch.distributed.get_rank(
+                        group=self.tp_group
+                    )
+                    tp_slice = slice(
+                        local_rank_in_group * orig_size,
+                        (local_rank_in_group + 1) * orig_size,
+                    )
+                    prompt_ids = all_prompt_ids[tp_slice]
+                    completion_ids = all_completion_ids[tp_slice]
+                    logprobs = all_logprobs[tp_slice]
+                else:
+                    prompt_ids = all_prompt_ids
+                    completion_ids = all_completion_ids
+                    logprobs = all_logprobs
+
+                extra_fields = {}  # No extra fields for colocate mode
+
+                if self.args.vllm_enable_sleep_mode:
+                    self.llm.sleep(level=2)
+
+        elif self.use_transformers_paged:
+            processor_kwargs = {
+                "max_length": self.max_prompt_length,
+                "truncation": True,
+                "add_special_tokens": False,
+            }
+            if is_conversational({"prompt": prompts[0]}):
+                processor_outputs = self.processing_class.apply_chat_template(
+                    conversation=prompts,
+                    **processor_kwargs,
+                    add_generation_prompt=True,
+                    tokenize=True,
+                    return_dict=True,
+                    **self.chat_template_kwargs,
+                )
+            else:
+                processor_outputs = self.processing_class(
+                    text=prompts, **processor_kwargs
+                )
+
+            with (
+                profiling_context(self, "transformers.generate_batch"),
+                unwrap_model_for_generation(
+                    self.model_wrapped,
+                    self.accelerator,
+                    gather_deepspeed3_params=self.args.ds3_gather_for_generation,
+                ) as unwrapped_model,
+                torch.no_grad(),
+                FSDP.summon_full_params(self.model_wrapped, recurse=False)
+                if self.is_fsdp_enabled
+                else nullcontext(),
+            ):
+                # Cast to the appropriate dtype based on training configuration
+                if self.args.bf16:
+                    unwrapped_model.to(torch.bfloat16)
+                elif self.args.fp16:
+                    unwrapped_model.to(torch.float16)
+                if self.args.cast_lm_head_to_fp32:
+                    unwrapped_model.lm_head.to(torch.float32)
+                with torch.inference_mode():
+                    # Continuous batching API expects 'inputs' arg only
+                    all_outputs = unwrapped_model.generate_batch(
+                        processor_outputs["input_ids"],
+                        generation_config=self.generation_config,
+                        progress_bar=False,
+                    )
+                    unwrapped_model.train()  # restore training mode, as generate_batch forces eval mode
+            completion_ids = [
+                output.generated_tokens for output in all_outputs.values()
+            ]
+            prompt_ids = processor_outputs["input_ids"]
+            logprobs = None  # not used in this case
+            extra_fields = {}  # No extra fields for paged mode
+
+        else:
+            # Regular generation path
+            processor_kwargs = {
+                "return_tensors": "pt",
+                "padding": True,
+                "padding_side": "left",
+                "max_length": self.max_prompt_length,
+                "truncation": True,
+                "add_special_tokens": False,
+            }
+            if is_conversational({"prompt": prompts[0]}):
+                generate_inputs = self.processing_class.apply_chat_template(
+                    conversation=prompts,
+                    **processor_kwargs,
+                    add_generation_prompt=True,
+                    tokenize=True,
+                    return_dict=True,
+                    **self.chat_template_kwargs,
+                )
+            else:
+                generate_inputs = self.processing_class(
+                    text=prompts, **processor_kwargs
+                )
+            generate_inputs = super()._prepare_inputs(generate_inputs)
+
+            with (
+                profiling_context(self, "transformers.generate"),
+                unwrap_model_for_generation(
+                    self.model_wrapped,
+                    self.accelerator,
+                    gather_deepspeed3_params=self.args.ds3_gather_for_generation,
+                ) as unwrapped_model,
+                torch.no_grad(),
+                FSDP.summon_full_params(self.model_wrapped, recurse=False)
+                if self.is_fsdp_enabled
+                else nullcontext(),
+            ):
+                prompt_completion_ids = unwrapped_model.generate(
+                    **generate_inputs,
+                    generation_config=self.generation_config,
+                    disable_compile=True,
+                )
+            # Compute prompt length and extract completion ids
+            prompt_ids, prompt_mask = (
+                generate_inputs["input_ids"],
+                generate_inputs["attention_mask"],
+            )
+            prompt_length = prompt_ids.size(1)
+            completion_ids = prompt_completion_ids[:, prompt_length:]
+
+            # Mask everything after the first EOS token
+            is_eos = completion_ids == self.eos_token_id
+            eos_idx = torch.full(
+                (is_eos.size(0),), is_eos.size(1), dtype=torch.long, device=device
+            )
+            eos_idx[is_eos.any(dim=1)] = is_eos.int().argmax(dim=1)[is_eos.any(dim=1)]
+            sequence_indices = torch.arange(is_eos.size(1), device=device).expand(
+                is_eos.size(0), -1
+            )
+            completion_mask = (sequence_indices <= eos_idx.unsqueeze(1)).int()
+            prompt_ids = [
+                p[m].tolist()
+                for p, m in zip(prompt_ids, prompt_mask.bool(), strict=True)
+            ]
+            completion_ids = [
+                c[m].tolist()
+                for c, m in zip(completion_ids, completion_mask.bool(), strict=True)
+            ]
+            logprobs = None  # not used in this case
+            extra_fields = {}  # No extra fields for non-rollout_func paths
+
+        return prompt_ids, completion_ids, logprobs, extra_fields
+
+    def _generate(self, prompts: list):
+        device = self.accelerator.device
+        mode = "train" if self.model.training else "eval"
+
+        prompt_ids, completion_ids, logprobs, extra_fields = self._generate_single_turn(
+            prompts
+        )
+
+        # Get completion length per sequence, used for logging
+        prompt_lengths = torch.tensor([len(ids) for ids in prompt_ids], device=device)
+        completion_lengths = torch.tensor(
+            [len(ids) for ids in completion_ids], device=device
+        )
+        agg_prompt_lengths = self.accelerator.gather(prompt_lengths)
+        agg_completion_lengths = self.accelerator.gather(completion_lengths)
+        total_prompt_tokens = agg_prompt_lengths.sum()
+        total_completion_tokens = (
+            agg_completion_lengths.sum()
+        )  # = num_items_in_batch, required for the DAPO loss
+
+        # Log the metrics
+        if mode == "train":
+            self.state.num_input_tokens_seen += (
+                total_prompt_tokens + total_completion_tokens
+            ).item()
+        self._metrics[mode]["num_tokens"] = [self.state.num_input_tokens_seen]
+
+        # Log completion lengths, mean, min, max
+        self._metrics[mode]["completions/mean_length"].append(
+            agg_completion_lengths.float().mean().item()
+        )
+        self._metrics[mode]["completions/min_length"].append(
+            agg_completion_lengths.float().min().item()
+        )
+        self._metrics[mode]["completions/max_length"].append(
+            agg_completion_lengths.float().max().item()
+        )
+
+        # Identify sequences that terminated with EOS and log their lengths
+        eos_and_pad = [self.eos_token_id, self.pad_token_id]
+        is_truncated = torch.tensor(
+            [ids[-1] not in eos_and_pad for ids in completion_ids], device=device
+        )
+        agg_is_truncated = self.accelerator.gather(is_truncated)
+        self._metrics[mode]["completions/clipped_ratio"].append(
+            agg_is_truncated.float().mean().item()
+        )
+        term_completion_lengths = agg_completion_lengths[~agg_is_truncated]
+        if (
+            len(term_completion_lengths) == 0
+        ):  # edge case where no terminated sequences are found
+            term_completion_lengths = torch.zeros(1, device=device)
+        self._metrics[mode]["completions/mean_terminated_length"].append(
+            term_completion_lengths.float().mean().item()
+        )
+        self._metrics[mode]["completions/min_terminated_length"].append(
+            term_completion_lengths.float().min().item()
+        )
+        self._metrics[mode]["completions/max_terminated_length"].append(
+            term_completion_lengths.float().max().item()
+        )
+
+        return (
+            prompt_ids,
+            completion_ids,
+            total_completion_tokens,
+            logprobs,
+            extra_fields,
+        )
+
+    def _generate_and_score_completions(
+        self, inputs: list[dict[str, torch.Tensor | Any]]
+    ) -> dict[str, torch.Tensor | Any]:
+        device = self.accelerator.device
+        mode = "train" if self.model.training else "eval"
+
+        prompts = [x["prompt"] for x in inputs]
+
+        if "images" in inputs[0]:
+            images = [example.get("images") for example in inputs]
+        elif "image" in inputs[0]:
+            images = [
+                [example.get("image")] if example.get("image") is not None else None
+                for example in inputs
+            ]
+        else:
+            images = None
+        # Transformers requires at least one image in the batch, otherwise it throws an error
+        if images is not None and all(img_list == [] for img_list in images):
+            images = None
+
+        # If the prompts are conversational and the inputs contain images, we need to convert the prompts from
+        # [{"role": "user", "content": "What color is the sky?"}] to
+        # [{"role": "user", "content": [{"type": "image", "image": <Image>}, {"type": "text", "text": "What color is the sky?"}]}]
+        if images is not None:
+            prompts = [
+                prepare_multimodal_messages(prompt, image_list)
+                for prompt, image_list in zip(prompts, images, strict=True)
+            ]
+
+        (
+            prompt_ids_list,
+            completion_ids_list,
+            num_items_in_batch,
+            sampling_per_token_logps_list,
+            extra_fields,
+        ) = self._generate(prompts)
+
+        # Convert lists of token IDs to padded tensors
+        prompt_ids = [torch.tensor(ids, device=device) for ids in prompt_ids_list]
+        prompt_mask = [torch.ones_like(ids, dtype=torch.long) for ids in prompt_ids]
+        prompt_ids = pad(
+            prompt_ids, padding_value=self.pad_token_id, padding_side="left"
+        )
+        prompt_mask = pad(prompt_mask, padding_value=0, padding_side="left")
+        completion_ids = [
+            torch.tensor(ids, device=device) for ids in completion_ids_list
+        ]
+        completion_mask = [
+            torch.ones_like(ids, dtype=torch.long) for ids in completion_ids
+        ]
+        completion_ids = pad(
+            completion_ids, padding_value=self.pad_token_id, padding_side="right"
+        )
+        completion_mask = pad(completion_mask, padding_value=0, padding_side="right")
+        if sampling_per_token_logps_list is not None:
+            sampling_per_token_logps = [
+                torch.tensor(logps, device=device)
+                for logps in sampling_per_token_logps_list
+            ]
+            sampling_per_token_logps = pad(
+                sampling_per_token_logps, padding_value=0.0, padding_side="right"
+            )
+        else:
+            sampling_per_token_logps = None
+
+        # If mask_truncated_completions is enabled, zero out truncated completions in completion_mask
+        if self.mask_truncated_completions:
+            eos_and_pad = [self.eos_token_id, self.pad_token_id]
+            is_truncated = torch.tensor(
+                [ids[-1] not in eos_and_pad for ids in completion_ids_list],
+                device=device,
+            )
+            completion_mask = completion_mask * (~is_truncated).unsqueeze(1).int()
+
+        # Concatenate prompt_mask with completion_mask for logit computation
+        prompt_completion_ids = torch.cat(
+            [prompt_ids, completion_ids], dim=1
+        )  # (B, P+C)
+        attention_mask = torch.cat([prompt_mask, completion_mask], dim=1)  # (B, P+C)
+
+        logits_to_keep = completion_ids.size(
+            1
+        )  # we only need to compute the logits for the completion tokens
+        batch_size = (
+            self.args.per_device_train_batch_size
+            if mode == "train"
+            else self.args.per_device_eval_batch_size
+        )
+
+        num_images = (
+            [len(img_list) for img_list in images] if images is not None else None
+        )
+
+        # Get forward_kwargs for models with multimodal inputs
+        if images is not None:
+            prompts_text = [
+                apply_chat_template(
+                    {"prompt": prompt},
+                    self.processing_class,
+                    **self.chat_template_kwargs,
+                )["prompt"]
+                for prompt in prompts
+            ]
+            prompt_inputs = self.processing_class(
+                images=images, text=prompts_text, padding=True, return_tensors="pt"
+            )
+            prompt_inputs = super()._prepare_inputs(prompt_inputs)
+            forward_kwargs = {
+                k: v
+                for k, v in prompt_inputs.items()
+                if k not in ["input_ids", "attention_mask"]
+            }
+        else:
+            forward_kwargs = {}
+
+        # If token_type_ids are used, extend them with zeros for the completion part
+        if "token_type_ids" in forward_kwargs:
+            token_type_ids = forward_kwargs["token_type_ids"]
+            forward_kwargs["token_type_ids"] = torch.cat(
+                [token_type_ids, token_type_ids.new_zeros(completion_ids.shape)], dim=1
+            )
+
+        with torch.no_grad():
+            # If the generation and optimization steps are misaligned—i.e., if generation does not occur at the end of
+            # a full optimizer step (when gradient_accumulation_steps is not a multiple of generate_every)—then the
+            # samples may come from an earlier version of the model. In that case, we need to track old_per_token_logps
+            # for importance sampling. If the steps are aligned, importance sampling isn't necessary and we set
+            # old_per_token_logps to None.
+            # When using vLLM, we always compute old_per_token_logps for importance sampling, it was shown that the
+            # distribution mismatch between vLLM and the training model can be large and harm the training.
+            generate_every = (
+                self.args.steps_per_generation * self.num_iterations
+            )  # generation frequency
+            if self.args.gradient_accumulation_steps % generate_every != 0 or (
+                self.use_vllm and self.vllm_importance_sampling_correction
+            ):
+                old_per_token_logps, _ = self._get_per_token_logps_and_entropies(
+                    self.model,
+                    prompt_completion_ids,
+                    attention_mask,
+                    logits_to_keep,
+                    batch_size,
+                    num_images=num_images,
+                    **forward_kwargs,  # may contain pixel_values, image_grid_thw, pixel_attention_mask and image_sizes
+                )
+            else:
+                old_per_token_logps = None
+
+            # Compute the importance sampling ratio when using vLLM, to correct for potential distribution mismatch
+            if self.use_vllm and self.vllm_importance_sampling_correction:
+                importance_sampling_ratio = torch.exp(
+                    old_per_token_logps - sampling_per_token_logps
+                )
+                importance_sampling_ratio = torch.clamp(
+                    importance_sampling_ratio, max=self.vllm_importance_sampling_cap
+                )
+
+            # Compute the per-token log probabilities for the reference model
+            if self.beta != 0.0:
+                if self.ref_model is not None:
+                    ref_per_token_logps, _ = self._get_per_token_logps_and_entropies(
+                        self.ref_model,
+                        prompt_completion_ids,
+                        attention_mask,
+                        logits_to_keep,
+                        batch_size=batch_size,
+                        num_images=num_images,
+                        **forward_kwargs,  # may contain pixel_values, image_grid_thw, pixel_attention_mask and image_sizes
+                    )
+                else:
+                    with self.accelerator.unwrap_model(self.model).disable_adapter():
+                        ref_per_token_logps, _ = (
+                            self._get_per_token_logps_and_entropies(
+                                self.model,
+                                prompt_completion_ids,
+                                attention_mask,
+                                logits_to_keep,
+                                batch_size=batch_size,
+                                num_images=num_images,
+                                **forward_kwargs,  # may contain pixel_values, image_grid_thw, pixel_attention_mask and image_sizes
+                            )
+                        )
+            else:
+                ref_per_token_logps = None
+
+        # Decode
+        prompts_text = self.processing_class.batch_decode(
+            prompt_ids, skip_special_tokens=True
+        )
+        completions_text = self.processing_class.batch_decode(
+            completion_ids, skip_special_tokens=True
+        )
+        if is_conversational(inputs[0]):
+            completions = []
+            for prompt, completion in zip(prompts, completions_text, strict=True):
+                bootstrap = (
+                    prompt.pop()["content"] if prompt[-1]["role"] == "assistant" else ""
+                )
+                if isinstance(
+                    bootstrap, list
+                ):  # for VLM, the format might be [{"type": "text", "text": "..."}]
+                    assert len(bootstrap) == 1 and bootstrap[0]["type"] == "text"
+                    bootstrap = bootstrap[0]["text"]
+                completions.append(
+                    [{"role": "assistant", "content": bootstrap + completion}]
+                )
+        else:
+            completions = completions_text
+
+        # Merge extra_fields from rollout_func into inputs for reward functions
+        if extra_fields:
+            for i, inp in enumerate(inputs):
+                for key, values in extra_fields.items():
+                    if isinstance(values, list) and i < len(values):
+                        inp[key] = values[i]
+                    elif not isinstance(values, list):
+                        inp[key] = values
+
+        # Calculate rewards for each reward function. rewards_per_func aggregates rewards across all processes. This is
+        # important because rewards will be normalized per group, and completions are distributed. We will later slice
+        # rewards_per_func to extract each process's subset.
+        rewards_per_func = self._calculate_rewards(
+            inputs, prompts, completions, completion_ids_list
+        )
+
+        # Apply weights to each reward function's output and sum
+        rewards = (
+            rewards_per_func * self.reward_weights.to(device).unsqueeze(0)
+        ).nansum(dim=1)
+
+        # Compute grouped-wise rewards
+        mean_grouped_rewards = rewards.view(-1, self.num_generations).mean(dim=1)
+
+        # Normalize the rewards to compute the advantages
+        mean_grouped_rewards = mean_grouped_rewards.repeat_interleave(
+            self.num_generations, dim=0
+        )
+        advantages = rewards - mean_grouped_rewards
+
+        if self.scale_rewards in ["group", "none"]:
+            # If self.scale_rewards = "none", we'll still log group level std
+            std_rewards = rewards.view(-1, self.num_generations).std(dim=1)
+            std_rewards = std_rewards.repeat_interleave(self.num_generations, dim=0)
+        elif self.scale_rewards == "batch":
+            # Compute global std
+            std_rewards = rewards.std().expand_as(rewards)
+        else:
+            raise ValueError(
+                f"Invalid value for scale_rewards: {self.scale_rewards}. Must be one of 'batch', 'group', or 'none'."
+            )
+
+        is_std_zero = torch.isclose(std_rewards, torch.zeros_like(std_rewards))
+        if self.scale_rewards != "none":
+            advantages = advantages / (std_rewards + 1e-4)
+
+        # Slice to keep only the local part of the data
+        process_slice = slice(
+            self.accelerator.process_index * len(prompts),
+            (self.accelerator.process_index + 1) * len(prompts),
+        )
+        all_process_advantages = (
+            advantages.clone()
+        )  # keep the aggregated advantages for logging
+        advantages = advantages[process_slice]
+
+        # Calculate mean reward per function, but only for samples where the function was applied (non-NaN values)
+        for i, reward_func_name in enumerate(self.reward_func_names):
+            mean_rewards = torch.nanmean(rewards_per_func[:, i]).item()
+            self._metrics[mode][f"rewards/{reward_func_name}/mean"].append(mean_rewards)
+            std_func_rewards = nanstd(rewards_per_func[:, i]).item()
+            self._metrics[mode][f"rewards/{reward_func_name}/std"].append(
+                std_func_rewards
+            )
+        self._metrics[mode]["reward"].append(mean_grouped_rewards.mean().item())
+        self._metrics[mode]["reward_std"].append(std_rewards.mean().item())
+        self._metrics[mode]["frac_reward_zero_std"].append(
+            is_std_zero.float().mean().item()
+        )
+
+        # Log prompt and completion texts
+        self._logs["prompt"].extend(gather_object(prompts_text))
+        self._logs["completion"].extend(gather_object(completions_text))
+        for i, name in enumerate(self.reward_func_names):
+            self._logs["rewards"][name].extend(rewards_per_func[:, i].tolist())
+        self._logs["advantages"].extend(all_process_advantages.tolist())
+
+        if images is not None:
+            self._logs["images"].extend(gather_object(images))
+
+        if self.use_vllm and self.vllm_importance_sampling_correction:
+            delta = torch.abs(old_per_token_logps - sampling_per_token_logps)
+            delta = delta[completion_mask.bool()]
+            mean_delta = (
+                torch.mean(delta)
+                if delta.numel() > 0
+                else torch.tensor(0.0, device=device)
+            )
+            max_delta = (
+                torch.max(delta)
+                if delta.numel() > 0
+                else torch.tensor(0.0, device=device)
+            )
+            self._metrics[mode]["sampling/sampling_logp_difference/mean"].append(
+                self.accelerator.gather(mean_delta).mean().item()
+            )
+            self._metrics[mode]["sampling/sampling_logp_difference/max"].append(
+                self.accelerator.gather(max_delta).max().item()
+            )
+
+            flat_is_ratio = importance_sampling_ratio[completion_mask.bool()]
+            min_importance_sampling_ratio = (
+                torch.min(flat_is_ratio)
+                if flat_is_ratio.numel() > 0
+                else torch.tensor(0.0, device=device)
+            )
+            mean_importance_sampling_ratio = (
+                torch.mean(flat_is_ratio)
+                if flat_is_ratio.numel() > 0
+                else torch.tensor(0.0, device=device)
+            )
+            max_importance_sampling_ratio = (
+                torch.max(flat_is_ratio)
+                if flat_is_ratio.numel() > 0
+                else torch.tensor(0.0, device=device)
+            )
+            self._metrics[mode]["sampling/importance_sampling_ratio/min"].append(
+                nanmin(self.accelerator.gather(min_importance_sampling_ratio)).item()
+            )
+            self._metrics[mode]["sampling/importance_sampling_ratio/mean"].append(
+                self.accelerator.gather(mean_importance_sampling_ratio).nanmean().item()
+            )
+            self._metrics[mode]["sampling/importance_sampling_ratio/max"].append(
+                nanmax(self.accelerator.gather(max_importance_sampling_ratio)).item()
+            )
+
+        output = {
+            "prompt_ids": prompt_ids,
+            "prompt_mask": prompt_mask,
+            "completion_ids": completion_ids,
+            "completion_mask": completion_mask,
+            "advantages": advantages,
+            "num_items_in_batch": num_items_in_batch,
+        }
+        if old_per_token_logps is not None:
+            output["old_per_token_logps"] = old_per_token_logps
+        if self.use_vllm and self.vllm_importance_sampling_correction:
+            output["importance_sampling_ratio"] = importance_sampling_ratio
+        if ref_per_token_logps is not None:
+            output["ref_per_token_logps"] = ref_per_token_logps
+        if "pixel_values" in forward_kwargs:
+            output["pixel_values"] = forward_kwargs["pixel_values"]
+        if "image_grid_thw" in forward_kwargs:
+            output["image_grid_thw"] = forward_kwargs["image_grid_thw"]
+        if "pixel_attention_mask" in forward_kwargs:
+            output["pixel_attention_mask"] = forward_kwargs["pixel_attention_mask"]
+        if "image_sizes" in forward_kwargs:
+            output["image_sizes"] = forward_kwargs["image_sizes"]
+        if "token_type_ids" in forward_kwargs:
+            output["token_type_ids"] = forward_kwargs["token_type_ids"]
+        if images is not None:
+            output["num_images"] = num_images
+        return output
+
+    def compute_liger_loss(self, unwrapped_model, inputs):
+        # Compute the per-token log probabilities for the model
+        prompt_ids, prompt_mask = inputs["prompt_ids"], inputs["prompt_mask"]
+        completion_ids, completion_mask = (
+            inputs["completion_ids"],
+            inputs["completion_mask"],
+        )
+        input_ids = torch.cat([prompt_ids, completion_ids], dim=1)
+        attention_mask = torch.cat([prompt_mask, completion_mask], dim=1)
+        logits_to_keep = completion_ids.size(
+            1
+        )  # we only need to compute the logits for the completion tokens
+
+        # Get the last hidden state of the model
+        last_hidden_state = self._get_last_hidden_state(
+            unwrapped_model,
+            input_ids,
+            attention_mask,
+            logits_to_keep,
+            inputs.get("pixel_values"),
+            inputs.get("image_grid_thw"),
+            inputs.get("pixel_attention_mask"),
+            inputs.get("image_sizes"),
+        )
+
+        # compute loss and metrics using liger grpo loss
+        loss, metrics = self.liger_grpo_loss(
+            _input=last_hidden_state,
+            lin_weight=unwrapped_model.lm_head.weight,
+            selected_token_ids=completion_ids,
+            attention_mask=completion_mask,
+            advantages=inputs["advantages"],
+            bias=unwrapped_model.lm_head.bias,
+            old_per_token_logps=inputs.get("old_per_token_logps"),
+            ref_per_token_logps=inputs.get("ref_per_token_logps"),
+        )
+        # Extract metrics from the liger_grpo_loss output
+        # KL divergence is the first metric when beta is non-zero
+        mean_kl = metrics[0] if self.beta != 0.0 else None
+        clip_ratio = metrics[-1]
+
+        mode = "train" if self.model.training else "eval"
+        if self.beta != 0.0:
+            self._metrics[mode]["kl"].append(
+                self.accelerator.gather(mean_kl).mean().item()
+            )
+        self._metrics[mode]["clip_ratio"].append(
+            self.accelerator.gather(clip_ratio).mean().item()
+        )
+        return loss / self.current_gradient_accumulation_steps
+
+    @profiling_decorator
+    def compute_loss(
+        self, model, inputs, return_outputs=False, num_items_in_batch=None
+    ):
+        if return_outputs:
+            raise ValueError("The GRPOTrainer does not support returning outputs")
+        if self.use_liger_kernel:
+            # Compute the loss using the liger grpo loss
+            unwrapped_model = self.accelerator.unwrap_model(model)
+            return self._forward_redirection(
+                model, unwrapped_model, self.compute_liger_loss, unwrapped_model, inputs
+            )
+        return self._compute_loss(model, inputs)
+
+    def _compute_loss(self, model, inputs):
+        # Compute the per-token log probabilities for the model
+        prompt_ids, prompt_mask = inputs["prompt_ids"], inputs["prompt_mask"]
+        completion_ids, completion_mask = (
+            inputs["completion_ids"],
+            inputs["completion_mask"],
+        )
+        input_ids = torch.cat([prompt_ids, completion_ids], dim=1)
+        attention_mask = torch.cat([prompt_mask, completion_mask], dim=1)
+        logits_to_keep = completion_ids.size(
+            1
+        )  # we only need to compute the logits for the completion tokens
+
+        # Compute the per_token_logps and the entropy at each position in the completion
+        per_token_logps, entropies = self._get_per_token_logps_and_entropies(
+            model,
+            input_ids,
+            attention_mask,
+            logits_to_keep,
+            compute_entropy=True,
+            pixel_values=inputs.get("pixel_values"),
+            image_grid_thw=inputs.get("image_grid_thw"),
+            num_images=inputs.get("num_images"),
+            pixel_attention_mask=inputs.get("pixel_attention_mask"),
+            image_sizes=inputs.get("image_sizes"),
+            token_type_ids=inputs.get("token_type_ids"),
+        )
+
+        if self.top_entropy_quantile < 1.0:
+            entropy_mask = self.get_high_entropy_mask(
+                entropies, completion_mask, 1 - self.top_entropy_quantile
+            )
+        else:
+            entropy_mask = None
+
+        # Compute the KL divergence between the model and the reference model
+        if self.beta != 0.0:
+            ref_per_token_logps = inputs["ref_per_token_logps"]
+            per_token_kl = (
+                torch.exp(ref_per_token_logps - per_token_logps)
+                - (ref_per_token_logps - per_token_logps)
+                - 1
+            )
+
+        # Compute the loss
+        advantages = inputs["advantages"]
+        # When num_iterations == 1 and steps_per_generation <= gradient_accumulation_steps,
+        # old_per_token_logps == per_token_logps. In this case we can skip its computation
+        # (see _generate_and_score_completions) and instead use per_token_logps.detach().
+        # The exception is when using vLLM, where we always compute old_per_token_logps
+        # for importance sampling
+        old_per_token_logps = inputs.get("old_per_token_logps")
+        old_per_token_logps = (
+            per_token_logps.detach()
+            if old_per_token_logps is None
+            else old_per_token_logps
+        )
+
+        log_ratio = per_token_logps - old_per_token_logps
+        if self.importance_sampling_level == "token":
+            log_importance_weights = log_ratio
+        elif self.importance_sampling_level == "sequence":
+            log_importance_weights = (log_ratio * completion_mask).sum(
+                -1
+            ) / completion_mask.sum(-1).clamp(min=1.0)
+            log_importance_weights = log_importance_weights.unsqueeze(-1)
+        else:
+            raise ValueError(
+                f"Unknown importance sampling level: {self.importance_sampling_level}. Possible values are 'token' "
+                "and 'sequence'."
+            )
+
+        coef_1 = torch.exp(log_importance_weights)
+
+        # From here, log_importance_weights (and all subsequent tensors, coef_1, coef_2, etc.) shape depends on
+        # importance_sampling_level: "token" level: (B, T); "sequence" level: (B, 1)
+        if self.loss_type == "cispo":
+            clamped_ratios = torch.clamp(coef_1, max=self.epsilon_high).detach()
+            per_token_loss = -clamped_ratios * advantages.unsqueeze(1) * per_token_logps
+        elif self.loss_type in ["grpo", "bnpo", "dr_grpo", "dapo"]:
+            coef_2 = torch.clamp(coef_1, 1 - self.epsilon_low, 1 + self.epsilon_high)
+            # Two-sided clipping
+            if self.args.delta is not None:
+                coef_1 = torch.clamp(coef_1, max=self.args.delta)
+
+            per_token_loss1 = coef_1 * advantages.unsqueeze(1)
+            per_token_loss2 = coef_2 * advantages.unsqueeze(1)
+            per_token_loss = -torch.min(per_token_loss1, per_token_loss2)
+        else:
+            raise ValueError(f"Unknown loss type: {self.loss_type}")
+
+        if entropy_mask is not None:
+            per_token_loss = per_token_loss * entropy_mask
+
+        if self.use_vllm and self.vllm_importance_sampling_correction:
+            per_token_loss = per_token_loss * inputs["importance_sampling_ratio"]
+
+        if self.beta != 0.0:
+            per_token_loss = per_token_loss + self.beta * per_token_kl
+
+        if self.loss_type == "grpo":
+            loss = (
+                (per_token_loss * completion_mask).sum(-1)
+                / completion_mask.sum(-1).clamp(min=1.0)
+            ).mean()
+            loss = loss / self.current_gradient_accumulation_steps
+        elif self.loss_type == "bnpo":
+            loss = (
+                per_token_loss * completion_mask
+            ).sum() / completion_mask.sum().clamp(min=1.0)
+            loss = loss / self.current_gradient_accumulation_steps
+        elif self.loss_type == "dr_grpo":
+            loss = (per_token_loss * completion_mask).sum() / (
+                per_token_loss.size(0) * self.max_completion_length
+            )
+            loss = loss / self.current_gradient_accumulation_steps
+        elif self.loss_type in ["cispo", "dapo"]:
+            normalizer = inputs["num_items_in_batch"] / self.accelerator.num_processes
+            loss = (per_token_loss * completion_mask).sum() / normalizer
+        else:
+            raise ValueError(f"Unknown loss type: {self.loss_type}")
+
+        # Log the metrics
+        mode = "train" if self.model.training else "eval"
+
+        completion_token_count = completion_mask.sum().clamp(min=1.0)
+
+        def masked_batch_mean(x):
+            if x.shape[1] == 1:  # when importance_sampling_level == "sequence"
+                return x.mean()
+            return (x * completion_mask).sum() / completion_token_count
+
+        if self.beta != 0.0:
+            mean_kl = masked_batch_mean(per_token_kl)
+            self._metrics[mode]["kl"].append(
+                self.accelerator.gather(mean_kl).nanmean().item()
+            )
+
+        mean_entropy = masked_batch_mean(entropies)
+        self._metrics[mode]["entropy"].append(
+            self.accelerator.gather(mean_entropy).nanmean().item()
+        )
+
+        if self.loss_type in ["grpo", "bnpo", "dr_grpo", "dapo"]:
+            # Compute the clipped probability ratios
+            is_low_clipped = (coef_1 < 1 - self.epsilon_low) & (
+                advantages.unsqueeze(1) < 0
+            )
+            is_high_clipped = (coef_1 > 1 + self.epsilon_high) & (
+                advantages.unsqueeze(1) > 0
+            )
+            is_region_clipped = is_low_clipped | is_high_clipped
+
+            low_clip = masked_batch_mean(is_low_clipped.float())
+            high_clip = masked_batch_mean(is_high_clipped.float())
+            clip_ratio = masked_batch_mean(is_region_clipped.float())
+
+            gathered_low_clip = self.accelerator.gather(low_clip)
+            self._metrics[mode]["clip_ratio/low_mean"].append(
+                gathered_low_clip.nanmean().item()
+            )
+            self._metrics[mode]["clip_ratio/low_min"].append(
+                nanmin(gathered_low_clip).item()
+            )
+            gathered_high_clip = self.accelerator.gather(high_clip)
+            self._metrics[mode]["clip_ratio/high_mean"].append(
+                gathered_high_clip.nanmean().item()
+            )
+            self._metrics[mode]["clip_ratio/high_max"].append(
+                nanmax(gathered_high_clip).item()
+            )
+            gathered_clip_ratio = self.accelerator.gather(clip_ratio)
+            self._metrics[mode]["clip_ratio/region_mean"].append(
+                gathered_clip_ratio.nanmean().item()
+            )
+        elif self.loss_type == "cispo":
+            is_cispo_clipped = (coef_1 > self.epsilon_high) & (
+                advantages.unsqueeze(1) > 0
+            )
+            cispo_clip_ratio = masked_batch_mean(is_cispo_clipped.float())
+            gathered_cispo_clip_ratio = self.accelerator.gather(cispo_clip_ratio)
+            self._metrics[mode]["cispo_clip_ratio"].append(
+                gathered_cispo_clip_ratio.nanmean().item()
+            )
+
+        return loss
+
+    def prediction_step(
+        self, model, inputs, prediction_loss_only, ignore_keys: list[str] | None = None
+    ):
+        inputs = self._prepare_inputs(inputs)
+        with torch.no_grad():
+            with self.compute_loss_context_manager():
+                loss = self.compute_loss(model, inputs)
+            loss = loss.mean().detach()
+        return loss, None, None
+
+    def log(self, logs: dict[str, float], start_time: float | None = None) -> None:
+        mode = "train" if self.model.training else "eval"
+        metrics = {
+            key: sum(val) / len(val) for key, val in self._metrics[mode].items()
+        }  # average the metrics
+
+        # This method can be called both in training and evaluation. When called in evaluation, the keys in `logs`
+        # start with "eval_". We need to add the prefix "eval_" to the keys in `metrics` to match the format.
+        if mode == "eval":
+            metrics = {f"eval_{key}": val for key, val in metrics.items()}
+
+        logs = {**logs, **metrics}
+        super().log(logs, start_time)
+        self._metrics[mode].clear()
+
+        if self.accelerator.is_main_process and self.log_completions:
+            if is_rich_available():
+                print_prompt_completions_sample(
+                    self._logs["prompt"],
+                    self._logs["completion"],
+                    self._logs["rewards"],
+                    self._logs["advantages"],
+                    self.state.global_step,
+                    self.num_completions_to_print,
+                )
+
+            logging_backends = []
+            if (
+                self.args.report_to
+                and "wandb" in self.args.report_to
+                and wandb.run is not None
+            ):
+                logging_backends.append(wandb)
+            if self.args.report_to and "trackio" in self.args.report_to:
+                logging_backends.append(trackio)
+
+            table = {
+                "step": [str(self.state.global_step)] * len(self._logs["prompt"]),
+                "prompt": self._logs["prompt"],
+                "completion": self._logs["completion"],
+                **self._logs["rewards"],
+                "advantage": self._logs["advantages"],
+            }
+
+            df_base = pd.DataFrame(table)
+            images_raw = self._logs["images"] or []
+
+            for logging_backend in logging_backends:
+                if images_raw:
+                    images = []
+                    for image_list in self._logs["images"]:
+                        images.append(
+                            [logging_backend.Image(image) for image in image_list]
+                        )
+                    df = pd.concat(
+                        [df_base, pd.Series(images, name="image")],
+                        axis=1,
+                        copy=False,
+                    )
+                else:
+                    df = df_base
+
+                if self.wandb_log_unique_prompts:
+                    df = df.drop_duplicates(subset=["prompt"])
+
+                logging_backend.log(
+                    {"completions": logging_backend.Table(dataframe=df)}
+                )
+
+    # Ensure the model card is saved along with the checkpoint
+    def _save_checkpoint(self, model, trial):
+        if self.args.hub_model_id is None:
+            model_name = Path(self.args.output_dir).name
+        else:
+            model_name = self.args.hub_model_id.split("/")[-1]
+        self.create_model_card(model_name=model_name)
+        super()._save_checkpoint(model, trial)
diff --git a/src/aixpert/training/training/trl/trainer/judges.py b/src/aixpert/training/training/trl/trainer/judges.py
new file mode 100644
index 0000000..1cb8fba
--- /dev/null
+++ b/src/aixpert/training/training/trl/trainer/judges.py
@@ -0,0 +1,104 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+
+from ..experimental.judges import AllTrueJudge as _AllTrueJudge
+from ..experimental.judges import BaseBinaryJudge as _BaseBinaryJudge
+from ..experimental.judges import BaseJudge as _BaseJudge
+from ..experimental.judges import BasePairwiseJudge as _BasePairwiseJudge
+from ..experimental.judges import BaseRankJudge as _BaseRankJudge
+from ..experimental.judges import HfPairwiseJudge as _HfPairwiseJudge
+from ..experimental.judges import OpenAIPairwiseJudge as _OpenAIPairwiseJudge
+from ..experimental.judges import PairRMJudge as _PairRMJudge
+
+
+class AllTrueJudge(_AllTrueJudge):
+    def __init__(self, *args, **kwargs):
+        warnings.warn(
+            "The `AllTrueJudge` is now located in `trl.experimental`. Please update your imports to "
+            "`from trl.experimental.judges import AllTrueJudge`. The current import path will be removed and no "
+            "longer supported in TRL 0.29."
+        )
+        super().__init__(*args, **kwargs)
+
+
+class BaseBinaryJudge(_BaseBinaryJudge):
+    def __init__(self, *args, **kwargs):
+        warnings.warn(
+            "The `BaseBinaryJudge` is now located in `trl.experimental`. Please update your imports to "
+            "`from trl.experimental.judges import BaseBinaryJudge`. The current import path will be removed and no "
+            "longer supported in TRL 0.29."
+        )
+        super().__init__(*args, **kwargs)
+
+
+class BaseJudge(_BaseJudge):
+    def __init__(self, *args, **kwargs):
+        warnings.warn(
+            "The `BaseJudge` is now located in `trl.experimental`. Please update your imports to "
+            "`from trl.experimental.judges import BaseJudge`. The current import path will be removed and no "
+            "longer supported in TRL 0.29."
+        )
+        super().__init__(*args, **kwargs)
+
+
+class BasePairwiseJudge(_BasePairwiseJudge):
+    def __init__(self, *args, **kwargs):
+        warnings.warn(
+            "The `BasePairwiseJudge` is now located in `trl.experimental`. Please update your imports to "
+            "`from trl.experimental.judges import BasePairwiseJudge`. The current import path will be removed and no "
+            "longer supported in TRL 0.29."
+        )
+        super().__init__(*args, **kwargs)
+
+
+class BaseRankJudge(_BaseRankJudge):
+    def __init__(self, *args, **kwargs):
+        warnings.warn(
+            "The `BaseRankJudge` is now located in `trl.experimental`. Please update your imports to "
+            "`from trl.experimental.judges import BaseRankJudge`. The current import path will be removed and no "
+            "longer supported in TRL 0.29."
+        )
+        super().__init__(*args, **kwargs)
+
+
+class HfPairwiseJudge(_HfPairwiseJudge):
+    def __init__(self, *args, **kwargs):
+        warnings.warn(
+            "The `HfPairwiseJudge` is now located in `trl.experimental`. Please update your imports to "
+            "`from trl.experimental.judges import HfPairwiseJudge`. The current import path will be removed and no "
+            "longer supported in TRL 0.29."
+        )
+        super().__init__(*args, **kwargs)
+
+
+class OpenAIPairwiseJudge(_OpenAIPairwiseJudge):
+    def __init__(self, *args, **kwargs):
+        warnings.warn(
+            "The `OpenAIPairwiseJudge` is now located in `trl.experimental`. Please update your imports to "
+            "`from trl.experimental.judges import OpenAIPairwiseJudge`. The current import path will be removed and no "
+            "longer supported in TRL 0.29."
+        )
+        super().__init__(*args, **kwargs)
+
+
+class PairRMJudge(_PairRMJudge):
+    def __init__(self, *args, **kwargs):
+        warnings.warn(
+            "The `PairRMJudge` is now located in `trl.experimental`. Please update your imports to "
+            "`from trl.experimental.judges import PairRMJudge`. The current import path will be removed and no "
+            "longer supported in TRL 0.29."
+        )
+        super().__init__(*args, **kwargs)
diff --git a/src/aixpert/training/training/trl/trainer/kto_config.py b/src/aixpert/training/training/trl/trainer/kto_config.py
new file mode 100644
index 0000000..e89856a
--- /dev/null
+++ b/src/aixpert/training/training/trl/trainer/kto_config.py
@@ -0,0 +1,270 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+from dataclasses import dataclass, field
+from typing import Any
+
+from transformers import TrainingArguments
+
+
+@dataclass
+class KTOConfig(TrainingArguments):
+    r"""
+    Configuration class for the [`KTOTrainer`].
+
+    This class includes only the parameters that are specific to KTO training. For a full list of training arguments,
+    please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this class may
+    differ from those in [`~transformers.TrainingArguments`].
+
+    Using [`~transformers.HfArgumentParser`] we can turn this class into
+    [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
+    command line.
+
+    Parameters
+    ----------
+        max_length (`int` or `None`, *optional*, defaults to `1024`):
+            Maximum length of the sequences (prompt + completion) in the batch. This argument is required if you want
+            to use the default data collator.
+        max_prompt_length (`int` or `None`, *optional*, defaults to `512`):
+            Maximum length of the prompt. This argument is required if you want to use the default data collator.
+        max_completion_length (`int`, *optional*):
+            Maximum length of the completion. This argument is required if you want to use the default data collator
+            and your model is an encoder-decoder.
+        beta (`float`, *optional*, defaults to `0.1`):
+            Parameter controlling the deviation from the reference model. Higher β means less deviation from the
+            reference model.
+        loss_type (`str`, *optional*, defaults to `"kto"`):
+            Type of loss to use. Possible values are:
+
+                - `"kto"`: KTO loss from the [KTO](https://huggingface.co/papers/2402.01306) paper.
+                - `"apo_zero_unpaired"`: Unpaired variant of APO-zero loss from the
+                  [APO](https://huggingface.co/papers/2408.06266) paper.
+
+        desirable_weight (`float`, *optional*, defaults to `1.0`):
+            Desirable losses are weighed by this factor to counter unequal number of desirable and undesirable paris.
+        undesirable_weight (`float`, *optional*, defaults to `1.0`):
+            Undesirable losses are weighed by this factor to counter unequal number of desirable and undesirable pairs.
+        label_pad_token_id (`int`, *optional*, defaults to `-100`):
+            Label pad token id. This argument is required if you want to use the default data collator.
+        padding_value (`int`, *optional*):
+            Padding value to use. If `None`, the padding value of the tokenizer is used.
+        truncation_mode (`str`, *optional*, defaults to `"keep_end"`):
+            Truncation mode to use when the prompt is too long. Possible values are `"keep_end"` or `"keep_start"`.
+            This argument is required if you want to use the default data collator.
+        generate_during_eval (`bool`, *optional*, defaults to `False`):
+            If `True`, generates and logs completions from both the model and the reference model to W&B or Comet
+            during evaluation.
+        is_encoder_decoder (`bool`, *optional*):
+            When using the `model_init` argument (callable) to instantiate the model instead of the `model` argument,
+            you need to specify if the model returned by the callable is an encoder-decoder model.
+        precompute_ref_log_probs (`bool`, *optional*, defaults to `False`):
+            Whether to precompute reference model log probabilities for training and evaluation datasets. This is
+            useful when training without the reference model to reduce the total GPU memory needed.
+        model_init_kwargs (`dict[str, Any]`, *optional*):
+            Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a
+            string.
+        ref_model_init_kwargs (`dict[str, Any]`, *optional*):
+            Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the reference model
+            from a string.
+        dataset_num_proc: (`int`, *optional*):
+            Number of processes to use for processing the dataset.
+        disable_dropout (`bool`, *optional*, defaults to `True`):
+            Whether to disable dropout in the model and reference model.
+        use_liger_loss (`bool`, *optional*):
+            Whether to use Liger loss.
+
+            <Deprecated version="0.25.0">
+
+            Parameter `use_liger_loss` is deprecated and will be removed in version 0.28.0. Use `use_liger_kernel`
+            instead.
+
+            </Deprecated>
+
+        base_model_attribute_name (`str`, *optional*, defaults to `"model"`):
+            Name of the attribute in the model that contains the base model. This is used to get the base model from
+            the model when the model does not have a `get_decoder` method in the case when `use_liger_kernel` is
+            `True`.
+    """
+
+    _VALID_DICT_FIELDS = TrainingArguments._VALID_DICT_FIELDS + [
+        "model_init_kwargs",
+        "ref_model_init_kwargs",
+    ]
+
+    # Parameters whose default values are overridden from TrainingArguments
+    learning_rate: float = field(
+        default=1e-6,
+        metadata={"help": "The initial learning rate for AdamW."},
+    )
+    logging_steps: float = field(
+        default=10,
+        metadata={
+            "help": "Log every X updates steps. Should be an integer or a float in range `[0,1)`. If smaller than 1, "
+            "will be interpreted as ratio of total training steps."
+        },
+    )
+    gradient_checkpointing: bool = field(
+        default=True,
+        metadata={
+            "help": "If True, use gradient checkpointing to save memory at the expense of slower backward pass."
+        },
+    )
+    bf16: bool | None = field(
+        default=None,
+        metadata={
+            "help": "Whether to use bf16 (mixed) precision instead of 32-bit. Requires Ampere or higher NVIDIA "
+            "architecture or Intel XPU or using CPU (use_cpu) or Ascend NPU. If not set, it defaults to `True` if "
+            "`fp16` is not set."
+        },
+    )
+
+    max_length: int | None = field(
+        default=1024,
+        metadata={
+            "help": "Maximum length of the sequences (prompt + completion) in the batch."
+        },
+    )
+    max_prompt_length: int | None = field(
+        default=512,
+        metadata={
+            "help": "Maximum length of the prompt. This argument is required if you want to use the default data "
+            "collator and your model is an encoder-decoder."
+        },
+    )
+    max_completion_length: int | None = field(
+        default=None,
+        metadata={
+            "help": "Maximum length of the completion. This argument is required if you want to use the default data "
+            "collator and your model is an encoder-decoder."
+        },
+    )
+    beta: float = field(
+        default=0.1,
+        metadata={
+            "help": "Parameter controlling the deviation from the reference model. Higher β means less deviation from "
+            "the reference model."
+        },
+    )
+    loss_type: str = field(
+        default="kto",
+        metadata={
+            "help": "Type of loss to use.",
+            "choices": ["kto", "apo_zero_unpaired"],
+        },
+    )
+    desirable_weight: float = field(
+        default=1.0,
+        metadata={
+            "help": "Desirable losses are weighed by this factor to counter unequal number of desirable and "
+            "undesirable pairs.",
+        },
+    )
+    undesirable_weight: float = field(
+        default=1.0,
+        metadata={
+            "help": "Undesirable losses are weighed by this factor to counter unequal number of desirable and "
+            "undesirable pairs.",
+        },
+    )
+    label_pad_token_id: int = field(
+        default=-100,
+        metadata={
+            "help": "Label pad token id. This argument is required if you want to use the default data collator."
+        },
+    )
+    padding_value: int | None = field(
+        default=None,
+        metadata={
+            "help": "Padding value to use. If `None`, the padding value of the tokenizer is used."
+        },
+    )
+    truncation_mode: str = field(
+        default="keep_end",
+        metadata={
+            "help": "Truncation mode to use when the prompt is too long.",
+            "choices": ["keep_end", "keep_start"],
+        },
+    )
+    generate_during_eval: bool = field(
+        default=False,
+        metadata={
+            "help": "If `True`, generates and logs completions from both the model and the reference model to W&B "
+            "during evaluation."
+        },
+    )
+    is_encoder_decoder: bool | None = field(
+        default=None,
+        metadata={
+            "help": "When using the `model_init` argument (callable) to instantiate the model instead of the `model` "
+            "argument, you need to specify if the model returned by the callable is an encoder-decoder model."
+        },
+    )
+    disable_dropout: bool = field(
+        default=True,
+        metadata={"help": "Whether to disable dropout in the model."},
+    )
+    precompute_ref_log_probs: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to precompute reference model log probabilities for training and evaluation datasets. "
+            "This is useful when training without the reference model to reduce the total GPU memory needed."
+        },
+    )
+    model_init_kwargs: dict[str, Any] | None = field(
+        default=None,
+        metadata={
+            "help": "Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model "
+            "from a string."
+        },
+    )
+    ref_model_init_kwargs: dict[str, Any] | None = field(
+        default=None,
+        metadata={
+            "help": "Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the "
+            "reference model from a string."
+        },
+    )
+    dataset_num_proc: int | None = field(
+        default=None,
+        metadata={"help": "Number of processes to use for processing the dataset."},
+    )
+    use_liger_loss: bool = field(
+        default=None,
+        metadata={
+            "help": "Whether to use Liger loss. It requires liger-kernel to be installed."
+        },
+    )
+    base_model_attribute_name: str = field(
+        default="model",
+        metadata={
+            "help": "Name of the attribute in the model that contains the base model. This is used to get the base "
+            "model from the model when the model does not have a `get_decoder` method in the case when "
+            "`use_liger_kernel` is `True`."
+        },
+    )
+
+    def __post_init__(self):
+        self.bf16 = not (self.fp16) if self.bf16 is None else self.bf16
+
+        if self.use_liger_loss is not None:
+            warnings.warn(
+                "The `use_liger_loss` argument is deprecated and will be removed in version 0.28.0. Please use "
+                "`use_liger_kernel` instead.",
+                FutureWarning,
+                stacklevel=2,
+            )
+            self.use_liger_kernel = self.use_liger_loss
+
+        super().__post_init__()
diff --git a/src/aixpert/training/training/trl/trainer/kto_trainer.py b/src/aixpert/training/training/trl/trainer/kto_trainer.py
new file mode 100644
index 0000000..f51b6b2
--- /dev/null
+++ b/src/aixpert/training/training/trl/trainer/kto_trainer.py
@@ -0,0 +1,1983 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import os
+import random
+import textwrap
+import warnings
+from collections import defaultdict
+from collections.abc import Callable
+from contextlib import contextmanager, nullcontext
+from operator import itemgetter
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Literal
+
+import numpy as np
+import pandas as pd
+import torch
+import torch.nn.functional as F
+from accelerate import PartialState, logging
+from accelerate.utils import tqdm
+from datasets import Dataset, concatenate_datasets
+from torch import autocast, nn
+from torch.utils.data import DataLoader, SequentialSampler
+from transformers import (
+    AutoModelForCausalLM,
+    BaseImageProcessor,
+    DataCollator,
+    FeatureExtractionMixin,
+    PreTrainedModel,
+    PreTrainedTokenizerBase,
+    ProcessorMixin,
+    TrainerCallback,
+    TrainingArguments,
+    is_comet_available,
+    is_wandb_available,
+)
+from transformers.trainer_utils import EvalLoopOutput, has_length
+from transformers.utils import is_peft_available
+
+from ..data_utils import (
+    maybe_apply_chat_template,
+    maybe_extract_prompt,
+    maybe_unpair_preference_dataset,
+)
+from ..import_utils import is_liger_kernel_available
+from ..models import create_reference_model, prepare_deepspeed
+from .base_trainer import BaseTrainer
+from .kto_config import KTOConfig
+from .utils import (
+    DPODataCollatorWithPadding,
+    disable_dropout_in_model,
+    log_table_to_comet_experiment,
+    pad_to_length,
+    peft_module_casting_to_bf16,
+    selective_log_softmax,
+)
+
+
+if is_liger_kernel_available():
+    from liger_kernel.chunked_loss import LigerFusedLinearKTOLoss
+
+if is_peft_available():
+    from peft import PeftModel, get_peft_model, prepare_model_for_kbit_training
+
+if is_wandb_available():
+    import wandb
+
+
+if TYPE_CHECKING:
+    from transformers import PreTrainedModel, PreTrainedTokenizer
+
+
+logger = logging.get_logger(__name__)
+
+RUNNING_NAME = "running.pt"
+
+
+def _get_kl_dataset(batch: dict[str, list[Any]]) -> dict[str, list[Any]]:
+    """
+    Creates mismatched pairs of prompts and completions for the KL dataset by adding a +1 offset to the order of
+    completions. For best results, the mismatched outputs y' used to estimate the KL term for a batch should be the
+    same set as the matched outputs y used to estimate the rewards in that batch, just paired with different x.
+    """
+    batch["answer_input_ids"] = [batch["answer_input_ids"][-1]] + batch[
+        "answer_input_ids"
+    ][:-1]
+    batch["answer_attention_mask"] = [batch["answer_attention_mask"][-1]] + batch[
+        "answer_attention_mask"
+    ][:-1]
+    return batch
+
+
+def _tokenize(
+    batch: dict[str, list[Any]],
+    tokenizer: "PreTrainedTokenizer",
+) -> dict[str, list[Any]]:
+    """Tokenize a batch from a KTO specific dataset."""
+    prompt_tokenized = tokenizer(batch["prompt"], add_special_tokens=False)
+    prompt_input_ids = prompt_tokenized["input_ids"]
+    prompt_attention_mask = prompt_tokenized["attention_mask"]
+    prompt_and_completion = [
+        prompt + completion
+        for prompt, completion in zip(batch["prompt"], batch["completion"], strict=True)
+    ]
+    full_tokenized = tokenizer(prompt_and_completion, add_special_tokens=False)
+    full_input_ids = full_tokenized["input_ids"]
+    full_attention_mask = full_tokenized["attention_mask"]
+
+    answer_input_ids = [
+        f[len(p) :] for f, p in zip(full_input_ids, prompt_input_ids, strict=True)
+    ]
+    answer_attention_mask = [
+        f[len(p) :]
+        for f, p in zip(full_attention_mask, prompt_attention_mask, strict=True)
+    ]
+
+    # Concat tokens to form `enc(a) + enc(a + b)[len(enc(a)):]`
+    full_concat_input_ids = [
+        np.concatenate([p, a])
+        for p, a in zip(prompt_input_ids, answer_input_ids, strict=True)
+    ]
+    # Prepare input tokens for token by token comparison
+    full_input_ids = [np.array(f) for f in full_input_ids]
+    for full, concat in zip(full_input_ids, full_concat_input_ids, strict=True):
+        if len(full) != len(concat):
+            raise ValueError(
+                "The elements in 'full_input_ids' and 'full_concat_input_ids' must have the same pairwise length."
+            )
+
+    # On some tokenizers, like Llama-2 tokenizer, there are occasions where tokens
+    # can be merged together when tokenizing prompt+answer. This could result
+    # on the last token from the prompt being different when tokenized on its own
+    # vs when done as prompt+answer.
+    response_token_ids_start_idx = [len(p) for p in prompt_input_ids]
+
+    # If tokenized prompt is different than both prompt+answer, then it means the
+    # last token has changed due to merging.
+    for idx, (p, f, r) in enumerate(
+        zip(prompt_input_ids, full_input_ids, response_token_ids_start_idx, strict=True)
+    ):
+        if not np.array_equal(p, f[:r]):
+            response_token_ids_start_idx[idx] -= 1
+
+    prompt_input_ids = [
+        f[:r] for f, r in zip(full_input_ids, response_token_ids_start_idx, strict=True)
+    ]
+    prompt_attention_mask = [
+        f[:r]
+        for f, r in zip(full_attention_mask, response_token_ids_start_idx, strict=True)
+    ]
+
+    for p, m in zip(prompt_input_ids, prompt_attention_mask, strict=True):
+        if len(p) != len(m):
+            raise ValueError(
+                "Prompt input ids and attention mask should have the same length."
+            )
+
+    answer_input_ids = [
+        f[r:] for f, r in zip(full_input_ids, response_token_ids_start_idx, strict=True)
+    ]
+    answer_attention_mask = [
+        f[r:]
+        for f, r in zip(full_attention_mask, response_token_ids_start_idx, strict=True)
+    ]
+
+    output = dict(
+        prompt_input_ids=prompt_input_ids,
+        prompt_attention_mask=prompt_attention_mask,
+        answer_input_ids=answer_input_ids,
+        answer_attention_mask=answer_attention_mask,
+    )
+
+    return output
+
+
+def _process_tokens(
+    example: dict[str, Any], model: "PreTrainedModel" = None, **kwargs
+) -> dict:
+    """Process tokens of a KTO specific dataset.
+
+    At this stage, we don't convert to PyTorch tensors yet; we just handle the truncation in case the prompt +
+    completion responses is/are too long. First we truncate the prompt; if we're still too long, we truncate the
+    completion.
+
+    We also create the labels for the completion responses, which are of length equal to the sum of the length of the
+    prompt and the completion response, with label_pad_token_id for the prompt tokens.
+    """
+    prompt = example["prompt"]
+    completion = example["completion"]
+
+    batch = {
+        f"{kwargs['prefix']}prompt": prompt,
+        f"{kwargs['prefix']}completion": completion,
+        f"{kwargs['prefix']}label": example["label"],
+    }
+
+    if not kwargs["is_encoder_decoder"]:
+        # Check issues below for more details
+        #  1. https://github.com/huggingface/trl/issues/907
+        #  2. https://github.com/EleutherAI/lm-evaluation-harness/pull/531#issuecomment-1595586257
+        #  3. https://github.com/LianjiaTech/BELLE/issues/337
+
+        if not isinstance(prompt, str):
+            raise ValueError(f"prompt should be an str but got {type(prompt)}")
+
+        if not isinstance(completion, str):
+            raise ValueError(f"completion should be an str but got {type(completion)}")
+
+        # keys of format prompt_* refers to just the prompt and answer_* refers to just the answer
+        all_tokens = {
+            "prompt_input_ids": example["prompt_input_ids"],
+            "prompt_attention_mask": example["prompt_attention_mask"],
+            "answer_input_ids": example["answer_input_ids"],
+            "answer_attention_mask": example["answer_attention_mask"],
+        }
+
+        # calculate max length by checking if BOS/EOS is already there
+        max_length = kwargs["max_length"]
+        bos_token_id = kwargs["tokenizer"].bos_token_id
+        eos_token_id = kwargs["tokenizer"].eos_token_id
+        if (
+            len(all_tokens["prompt_input_ids"]) > 0
+            and bos_token_id != all_tokens["prompt_input_ids"][0]
+        ):
+            max_length -= 1
+        if (
+            len(all_tokens["answer_input_ids"]) > 0
+            and eos_token_id != all_tokens["answer_input_ids"][-1]
+        ):
+            max_length -= 1
+
+        # if combined sequence is too long (> max_length - 1 for BOS token - 1 for EOS), truncate the prompt
+        if (
+            len(all_tokens["prompt_input_ids"]) + len(all_tokens["answer_input_ids"])
+            > max_length
+        ):
+            for k in ["prompt_input_ids", "prompt_attention_mask"]:
+                if kwargs["truncation_mode"] == "keep_start":
+                    all_tokens[k] = all_tokens[k][: kwargs["max_prompt_length"]]
+                elif kwargs["truncation_mode"] == "keep_end":
+                    all_tokens[k] = all_tokens[k][-kwargs["max_prompt_length"] :]
+                else:
+                    raise ValueError(
+                        f"Unknown truncation mode: {kwargs['truncation_mode']}"
+                    )
+
+        # if that's still too long, truncate the response
+        if (
+            len(all_tokens["prompt_input_ids"]) + len(all_tokens["answer_input_ids"])
+            > max_length
+        ):
+            for k in ["answer_input_ids", "answer_attention_mask"]:
+                all_tokens[k] = all_tokens[k][
+                    : max_length - kwargs["max_prompt_length"]
+                ]
+
+        # all input_ids and attention mask as is. We then check if we need to add BOS/EOS tokens
+        batch[f"{kwargs['prefix']}prompt_input_ids"] = all_tokens["prompt_input_ids"]
+        batch[f"{kwargs['prefix']}prompt_attention_mask"] = all_tokens[
+            "prompt_attention_mask"
+        ]
+        batch[f"{kwargs['prefix']}completion_input_ids"] = (
+            all_tokens["prompt_input_ids"] + all_tokens["answer_input_ids"]
+        )
+        batch[f"{kwargs['prefix']}completion_attention_mask"] = (
+            all_tokens["prompt_attention_mask"] + all_tokens["answer_attention_mask"]
+        )
+
+        # add BOS, which affects both prompt and the full completion
+        if bos_token_id is not None:
+            if (
+                len(all_tokens["prompt_input_ids"]) == 0
+                or bos_token_id != all_tokens["prompt_input_ids"][0]
+            ):
+                batch[f"{kwargs['prefix']}prompt_input_ids"] = [bos_token_id] + batch[
+                    f"{kwargs['prefix']}prompt_input_ids"
+                ]
+                batch[f"{kwargs['prefix']}prompt_attention_mask"] = [1] + batch[
+                    f"{kwargs['prefix']}prompt_attention_mask"
+                ]
+                batch[f"{kwargs['prefix']}completion_input_ids"] = [
+                    bos_token_id
+                ] + batch[f"{kwargs['prefix']}completion_input_ids"]
+                batch[f"{kwargs['prefix']}completion_attention_mask"] = [1] + batch[
+                    f"{kwargs['prefix']}completion_attention_mask"
+                ]
+        # add EOS, which affects only the full completion
+        if (
+            len(all_tokens["answer_input_ids"]) == 0
+            or eos_token_id != all_tokens["answer_input_ids"][-1]
+        ):
+            batch[f"{kwargs['prefix']}completion_input_ids"] = batch[
+                f"{kwargs['prefix']}completion_input_ids"
+            ] + [eos_token_id]
+            batch[f"{kwargs['prefix']}completion_attention_mask"] = batch[
+                f"{kwargs['prefix']}completion_attention_mask"
+            ] + [1]
+
+        batch[f"{kwargs['prefix']}completion_labels"] = batch[
+            f"{kwargs['prefix']}completion_input_ids"
+        ][:]
+        batch[f"{kwargs['prefix']}completion_labels"][
+            : len(batch[f"{kwargs['prefix']}prompt_input_ids"])
+        ] = [kwargs["label_pad_token_id"]] * len(
+            batch[f"{kwargs['prefix']}prompt_input_ids"]
+        )
+    else:
+        completion_tokens = kwargs["tokenizer"](
+            completion,
+            truncation=True,
+            max_length=kwargs["max_completion_length"],
+            add_special_tokens=True,
+        )
+        prompt_tokens = kwargs["tokenizer"](
+            prompt,
+            truncation=True,
+            max_length=kwargs["max_prompt_length"],
+            add_special_tokens=True,
+        )
+
+        batch[f"{kwargs['prefix']}prompt_input_ids"] = prompt_tokens["input_ids"]
+        batch[f"{kwargs['prefix']}prompt_attention_mask"] = prompt_tokens[
+            "attention_mask"
+        ]
+
+        batch[f"{kwargs['prefix']}completion_labels"] = completion_tokens["input_ids"]
+        batch[f"{kwargs['prefix']}completion_attention_mask"] = completion_tokens[
+            "attention_mask"
+        ]
+        if model is not None and hasattr(
+            model, "prepare_decoder_input_ids_from_labels"
+        ):
+            batch[f"{kwargs['prefix']}completion_decoder_input_ids"] = (
+                model.prepare_decoder_input_ids_from_labels(
+                    labels=torch.tensor(batch["completion_labels"])
+                )
+            )
+
+    return batch
+
+
+class KTOTrainer(BaseTrainer):
+    r"""
+    Initialize KTOTrainer.
+
+    Args:
+        model ([`~transformers.PreTrainedModel`]):
+            The model to train, preferably an [`~transformers.AutoModelForSequenceClassification`].
+        ref_model ([`PreTrainedModelWrapper`]):
+            Hugging Face transformer model with a casual language modelling head. Used for implicit reward computation
+            and loss. If no reference model is provided, the trainer will create a reference model with the same
+            architecture as the model to be optimized.
+        args ([`KTOConfig`]):
+            The arguments to use for training.
+        train_dataset ([`~datasets.Dataset`]):
+            The dataset to use for training.
+        eval_dataset ([`~datasets.Dataset`]):
+            The dataset to use for evaluation.
+        processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*):
+            Processing class used to process the data. If provided, will be used to automatically process the inputs
+            for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
+            reuse the fine-tuned model.
+        data_collator ([`~transformers.DataCollator`], *optional*):
+            The data collator to use for training. If None is specified, the default data collator
+            ([`DPODataCollatorWithPadding`]) will be used which will pad the sequences to the maximum length of the
+            sequences in the batch, given a dataset of paired sequences.
+        model_init (`Callable[[], transformers.PreTrainedModel]`):
+            The model initializer to use for training. If None is specified, the default model initializer will be
+            used.
+        callbacks (`list[transformers.TrainerCallback]`):
+            The callbacks to use for training.
+        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
+            The optimizer and scheduler to use for training.
+        preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
+            The function to use to preprocess the logits before computing the metrics.
+        peft_config (`dict`, defaults to `None`):
+            The PEFT configuration to use for training. If you pass a PEFT configuration, the model will be wrapped in
+            a PEFT model.
+        compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
+            The function to use to compute the metrics. Must take a `EvalPrediction` and return a dictionary string to
+            metric values.
+        model_adapter_name (`str`, defaults to `None`):
+            Name of the train target PEFT adapter, when using LoRA with multiple adapters.
+        ref_adapter_name (`str`, defaults to `None`):
+            Name of the reference PEFT adapter, when using LoRA with multiple adapters.
+    """
+
+    _tag_names = ["trl", "kto"]
+    _name = "KTO"
+    _paper = {
+        "title": "KTO: Model Alignment as Prospect Theoretic Optimization",
+        "id": "2402.01306",
+        # docstyle-ignore
+        "citation": textwrap.dedent("""\
+            @article{ethayarajh2024kto,
+                title        = {{KTO: Model Alignment as Prospect Theoretic Optimization}},
+                author       = {Kawin Ethayarajh and Winnie Xu and Niklas Muennighoff and Dan Jurafsky and Douwe Kiela},
+                year         = 2024,
+                eprint       = {arXiv:2402.01306},
+            }"""),
+    }
+
+    def __init__(
+        self,
+        model: PreTrainedModel | nn.Module | str = None,
+        ref_model: PreTrainedModel | nn.Module | str | None = None,
+        args: KTOConfig = None,
+        train_dataset: Dataset | None = None,
+        eval_dataset: Dataset | dict[str, Dataset] | None = None,
+        processing_class: PreTrainedTokenizerBase
+        | BaseImageProcessor
+        | FeatureExtractionMixin
+        | ProcessorMixin
+        | None = None,
+        data_collator: DataCollator | None = None,
+        model_init: Callable[[], PreTrainedModel] | None = None,
+        callbacks: list[TrainerCallback] | None = None,
+        optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (
+            None,
+            None,
+        ),
+        preprocess_logits_for_metrics: Callable[
+            [torch.Tensor, torch.Tensor], torch.Tensor
+        ]
+        | None = None,
+        peft_config: dict | None = None,
+        compute_metrics: Callable[[EvalLoopOutput], dict] | None = None,
+        model_adapter_name: str | None = None,
+        ref_adapter_name: str | None = None,
+    ):
+        if not os.environ.get("TRL_EXPERIMENTAL_SILENCE"):
+            warnings.warn(
+                "This trainer will soon be moved to trl.experimental and is a candidate for removal. If you rely on "
+                "it and want it to remain, please share your comments here: "
+                "https://github.com/huggingface/trl/issues/4223. Silence this warning by setting environment variable "
+                "TRL_EXPERIMENTAL_SILENCE=1."
+            )
+        if type(args) is TrainingArguments:
+            raise ValueError("Please use `KTOConfig` instead TrainingArguments.")
+
+        if not isinstance(model, str) and ref_model is model:
+            raise ValueError(
+                "`model` and `ref_model` cannot be the same object. If you want `ref_model` to be the "
+                "same as `model`, you must mass a copy of it, or `None` if you use peft."
+            )
+
+        if args.model_init_kwargs is None:
+            model_init_kwargs = {}
+        elif not isinstance(model, str):
+            raise ValueError(
+                "You passed model_kwargs to the KTOTrainer. But your model is already instantiated."
+            )
+        else:
+            model_init_kwargs = args.model_init_kwargs
+            dtype = model_init_kwargs.get("dtype")
+            if dtype is not None:
+                # Convert to `torch.dtype` if an str is passed
+                if isinstance(dtype, str) and dtype != "auto":
+                    dtype = getattr(torch, dtype)
+                if dtype != "auto" and not isinstance(dtype, torch.dtype):
+                    raise ValueError(
+                        f"Invalid `dtype` passed to the KTOConfig. Expected a string with either `torch.dtype` or 'auto', but got {dtype}."
+                    )
+                model_init_kwargs["dtype"] = dtype
+
+        if args.ref_model_init_kwargs is None:
+            ref_model_init_kwargs = {}
+        elif not isinstance(ref_model, str):
+            raise ValueError(
+                "You passed ref_model_kwargs to the KTOTrainer. But your ref_model is already instantiated."
+            )
+        else:
+            ref_model_init_kwargs = args.ref_model_init_kwargs
+            dtype = ref_model_init_kwargs.get("dtype")
+            if dtype is not None:
+                # Convert to `torch.dtype` if an str is passed
+                if isinstance(dtype, str) and dtype != "auto":
+                    dtype = getattr(torch, dtype)
+                if dtype != "auto" and not isinstance(dtype, torch.dtype):
+                    raise ValueError(
+                        f"Invalid `dtype` passed to the KTOConfig. Expected a string with either `torch.dtype` or 'auto', but got {dtype}."
+                    )
+                ref_model_init_kwargs["dtype"] = dtype
+
+        if isinstance(model, str):
+            model = AutoModelForCausalLM.from_pretrained(model, **model_init_kwargs)
+
+        if isinstance(ref_model, str):
+            ref_model = AutoModelForCausalLM.from_pretrained(
+                ref_model, **ref_model_init_kwargs
+            )
+
+        # Initialize this variable to False. This helps tracking the case when `peft_module_casting_to_bf16`
+        # has been called in order to properly call autocast if needed.
+        self._peft_has_been_casted_to_bf16 = False
+
+        if not is_peft_available() and peft_config is not None:
+            raise ValueError(
+                "PEFT is not installed and you passed a `peft_config` in the trainer's kwargs, please install it with `pip install peft` to use the PEFT models"
+            )
+        if is_peft_available() and peft_config is not None:
+            # if model is a peft model and we have a peft_config, we merge and unload it first
+            if isinstance(model, PeftModel):
+                model = model.merge_and_unload()
+
+            if getattr(model, "is_loaded_in_8bit", False) or getattr(
+                model, "is_loaded_in_4bit", False
+            ):
+                _support_gc_kwargs = hasattr(
+                    args, "gradient_checkpointing_kwargs"
+                ) and "gradient_checkpointing_kwargs" in list(
+                    inspect.signature(prepare_model_for_kbit_training).parameters
+                )
+
+                prepare_model_kwargs = {
+                    "use_gradient_checkpointing": args.gradient_checkpointing
+                }
+
+                if _support_gc_kwargs:
+                    prepare_model_kwargs["gradient_checkpointing_kwargs"] = (
+                        args.gradient_checkpointing_kwargs
+                    )
+
+                model = prepare_model_for_kbit_training(model, **prepare_model_kwargs)
+            elif args.gradient_checkpointing:
+                # For backward compatibility with older versions of transformers
+                if hasattr(model, "enable_input_require_grads"):
+                    model.enable_input_require_grads()
+                else:
+
+                    def make_inputs_require_grad(module, input, output):
+                        output.requires_grad_(True)
+
+                    model.get_input_embeddings().register_forward_hook(
+                        make_inputs_require_grad
+                    )
+
+            # get peft model with the given config
+            model = get_peft_model(model, peft_config)
+            if args.bf16 and getattr(model, "is_loaded_in_4bit", False):
+                peft_module_casting_to_bf16(model)
+                # If args.bf16 we need to explicitly call `generate` with torch amp autocast context manager
+                self._peft_has_been_casted_to_bf16 = True
+
+        # For models that use gradient_checkpointing, we need to attach a hook that enables input
+        # to explicitly have `requires_grad=True`, otherwise training will either silently
+        # fail or completely fail.
+        elif args.gradient_checkpointing:
+            # For backward compatibility with older versions of transformers
+            if hasattr(model, "enable_input_require_grads"):
+                model.enable_input_require_grads()
+            else:
+
+                def make_inputs_require_grad(module, input, output):
+                    output.requires_grad_(True)
+
+                model.get_input_embeddings().register_forward_hook(
+                    make_inputs_require_grad
+                )
+
+        if args.generate_during_eval and not (
+            is_wandb_available() or is_comet_available()
+        ):
+            raise ValueError(
+                "`generate_during_eval=True` requires Weights and Biases or Comet to be installed."
+                " Please install `wandb` or `comet-ml` to resolve."
+            )
+
+        if model is not None:
+            self.is_encoder_decoder = model.config.is_encoder_decoder
+        elif args.is_encoder_decoder is None:
+            raise ValueError(
+                "When no model is provided, you need to pass the parameter is_encoder_decoder."
+            )
+        else:
+            self.is_encoder_decoder = args.is_encoder_decoder
+
+        self.is_peft_model = is_peft_available() and isinstance(model, PeftModel)
+        self.model_adapter_name = model_adapter_name
+        self.ref_adapter_name = ref_adapter_name
+
+        if ref_model:
+            self.ref_model = ref_model
+        elif self.is_peft_model or args.precompute_ref_log_probs:
+            # The `model` with adapters turned off will be used as the reference model
+            self.ref_model = None
+        else:
+            self.ref_model = create_reference_model(model)
+
+        if processing_class is None:
+            raise ValueError(
+                "max_length or a processing_class must be specified when using the default DPODataCollatorWithPadding"
+            )
+        if args.max_length is None:
+            logger.warning(
+                "When using DPODataCollatorWithPadding, you should set `max_length` in the KTOTrainer's init"
+                " it will be set to `512` by default, but you should do it yourself in the future.",
+            )
+            max_length = 512
+        if args.max_length is not None:
+            max_length = args.max_length
+
+        if args.max_prompt_length is None:
+            logger.warning(
+                "When using DPODataCollatorWithPadding, you should set `max_prompt_length` in the KTOTrainer's init"
+                " it will be set to `128` by default, but you should do it yourself in the future.",
+            )
+            max_prompt_length = 128
+        if args.max_prompt_length is not None:
+            max_prompt_length = args.max_prompt_length
+
+        max_completion_length = None
+        if args.max_completion_length is None and self.is_encoder_decoder:
+            logger.warning(
+                "When using DPODataCollatorWithPadding with an encoder decoder architecture, you should set `max_completion_length` in the KTOTrainer's init"
+                " it will be set to `128` by default, but you should do it yourself in the future.",
+            )
+            max_completion_length = 128
+        if args.max_completion_length is not None and self.is_encoder_decoder:
+            max_completion_length = args.max_completion_length
+
+        if data_collator is None:
+            data_collator = DPODataCollatorWithPadding(
+                pad_token_id=processing_class.pad_token_id,
+                label_pad_token_id=args.label_pad_token_id,
+                is_encoder_decoder=self.is_encoder_decoder,
+            )
+
+            if args.remove_unused_columns:
+                args.remove_unused_columns = False
+                # warn users
+                logger.warning(
+                    "When using DPODataCollatorWithPadding, you should set `remove_unused_columns=False` in your KTOConfig"
+                    " we have set it for you, but you should do it yourself in the future.",
+                )
+
+            self.use_dpo_data_collator = True
+        else:
+            self.use_dpo_data_collator = False
+
+        # Disable dropout in the model and reference model
+        if args.disable_dropout:
+            disable_dropout_in_model(model)
+            if self.ref_model is not None:
+                disable_dropout_in_model(self.ref_model)
+
+        self.loss_type = args.loss_type
+        self.max_length = max_length
+        self.generate_during_eval = args.generate_during_eval
+        self.label_pad_token_id = args.label_pad_token_id
+        self.padding_value = (
+            args.padding_value
+            if args.padding_value is not None
+            else processing_class.pad_token_id
+        )
+        self.max_prompt_length = max_prompt_length
+        self.truncation_mode = args.truncation_mode
+        self.max_completion_length = max_completion_length
+        self.processing_class = processing_class
+        self.precompute_ref_log_probs = args.precompute_ref_log_probs
+
+        # Not all losses require a KL calculation
+        self.calculate_KL = True
+        if self.loss_type in ["apo_zero_unpaired"]:
+            self.calculate_KL = False
+
+        # Since ref_logs are precomputed on the first call to get_train/eval_dataloader
+        # keep track of first called to avoid computation of future calls
+        self._precomputed_train_ref_log_probs = False
+        self._precomputed_eval_ref_log_probs = False
+
+        # metric
+        self._stored_metrics = defaultdict(lambda: defaultdict(list))
+
+        # KTO parameter
+        self.beta = args.beta
+        self.desirable_weight = args.desirable_weight
+        self.undesirable_weight = args.undesirable_weight
+        self.aux_loss_enabled = getattr(model.config, "output_router_logits", False)
+        self.aux_loss_coef = getattr(model.config, "router_aux_loss_coef", 0.0)
+        if self.aux_loss_enabled and self.aux_loss_coef == 0.0:
+            logger.warning(
+                "You set `output_router_logits` to `True` in the model config, but `router_aux_loss_coef` is set to "
+                "`0.0`, meaning the auxiliary loss will not be used. Either set `router_aux_loss_coef` to a value "
+                "greater than `0.0`, or set `output_router_logits` to `False` if you don't want to use the auxiliary "
+                "loss.",
+            )
+
+        # The trainer estimates the number of FLOPs (floating-point operations) using the number of elements in the
+        # input tensor associated with the key "input_ids". However, in KTO, the sampled data does not include the
+        # "input_ids" key. Instead, the available keys are "prompt_input_ids" and "completion_input_ids". As a result,
+        # the trainer issues the warning: "Could not estimate the number of tokens of the input, floating-point
+        # operations will not be computed." To suppress this warning, we set the "estimate_tokens" key in the model's
+        # "warnings_issued" dictionary to True. This acts as a flag to indicate that the warning has already been
+        # issued.
+        model.warnings_issued["estimate_tokens"] = True
+
+        # Compute that only on the main process for faster data processing.
+        # see: https://github.com/huggingface/trl/pull/1255
+        with PartialState().main_process_first():
+            # Extract the prompt if needed
+            train_dataset = train_dataset.map(
+                maybe_extract_prompt,
+                num_proc=args.dataset_num_proc,
+                desc="Extracting prompt from train dataset",
+            )
+            # Unpair the dataset if needed
+            train_dataset = maybe_unpair_preference_dataset(
+                train_dataset, args.dataset_num_proc, desc="Unpairing train dataset"
+            )
+            # Apply the chat template if needed
+            train_dataset = train_dataset.map(
+                maybe_apply_chat_template,
+                fn_kwargs={"tokenizer": processing_class},
+                num_proc=args.dataset_num_proc,
+                desc="Applying chat template to train dataset",
+            )
+            if eval_dataset is not None:
+                eval_dataset = eval_dataset.map(
+                    maybe_extract_prompt,
+                    num_proc=args.dataset_num_proc,
+                    desc="Extracting prompt from eval dataset",
+                )
+                eval_dataset = maybe_unpair_preference_dataset(
+                    eval_dataset, args.dataset_num_proc, desc="Unpairing eval dataset"
+                )
+                eval_dataset = eval_dataset.map(
+                    maybe_apply_chat_template,
+                    fn_kwargs={"tokenizer": processing_class},
+                    num_proc=args.dataset_num_proc,
+                    desc="Applying chat template to eval dataset",
+                )
+
+            # Tokenize and prepare the training datasets
+            train_dataset = train_dataset.map(
+                _tokenize,
+                batched=True,
+                fn_kwargs={"tokenizer": self.processing_class},
+                num_proc=args.dataset_num_proc,
+                desc="Tokenizing train dataset",
+            )
+
+            fn_kwargs = {
+                "prefix": "",
+                "is_encoder_decoder": self.is_encoder_decoder,
+                "tokenizer": self.processing_class,
+                "max_length": self.max_length,
+                "truncation_mode": self.truncation_mode,
+                "label_pad_token_id": self.label_pad_token_id,
+                "max_prompt_length": self.max_prompt_length,
+                "max_completion_length": self.max_completion_length,
+            }
+
+            train_dataset = train_dataset.map(
+                _process_tokens,
+                fn_kwargs=fn_kwargs,
+                num_proc=args.dataset_num_proc,
+                desc="Processing tokenized train dataset",
+            )
+
+            # Tokenize and prepare the eval datasets
+            if eval_dataset is not None:
+                eval_dataset = eval_dataset.map(
+                    _tokenize,
+                    fn_kwargs={"tokenizer": self.processing_class},
+                    batched=True,
+                    num_proc=args.dataset_num_proc,
+                    desc="Tokenizing eval dataset",
+                )
+
+                eval_dataset = eval_dataset.map(
+                    _process_tokens,
+                    fn_kwargs=fn_kwargs,
+                    num_proc=args.dataset_num_proc,
+                    desc="Processing tokenized eval dataset",
+                )
+
+            # Get KL datasets if needed
+            if self.calculate_KL:
+                if args.per_device_train_batch_size <= 1:
+                    raise ValueError(
+                        "Actual (not effective) batch size must be > 1. KTO will not work properly because the KL term will be equivalent to the implied reward."
+                    )
+
+                # create pairs for estimating the KL term by flipping the matched pairs in each batch of size total_batch_size
+                # i.e., (x_1, y_1), ..., (x_n, y_n) --> (x_1, y_n), ..., (x_n, y_1) = (x'_1, y'_1), ..., (x'_n, y'_n)
+                train_kl_dataset = train_dataset.map(
+                    _get_kl_dataset,
+                    batched=True,
+                    batch_size=args.per_device_train_batch_size,
+                    num_proc=args.dataset_num_proc,
+                    desc="Extracting KL train dataset",
+                )
+
+                fn_kwargs["prefix"] = "KL_"
+                train_kl_dataset = train_kl_dataset.map(
+                    _process_tokens,
+                    fn_kwargs=fn_kwargs,
+                    num_proc=args.dataset_num_proc,
+                    remove_columns=[
+                        c
+                        for c in train_kl_dataset.column_names
+                        if c in train_dataset.column_names
+                    ],
+                    desc="Processing tokenized train KL dataset",
+                )
+
+                # merge the datasets
+                train_dataset = concatenate_datasets(
+                    [train_dataset, train_kl_dataset], axis=1
+                )
+
+                if eval_dataset is not None:
+                    # Get KL dataset
+                    eval_kl_dataset = eval_dataset.map(
+                        _get_kl_dataset,
+                        batched=True,
+                        batch_size=args.per_device_train_batch_size,
+                        num_proc=args.dataset_num_proc,
+                        desc="Extracting eval KL dataset",
+                    )
+
+                    eval_kl_dataset = eval_kl_dataset.map(
+                        _process_tokens,
+                        fn_kwargs=fn_kwargs,
+                        num_proc=args.dataset_num_proc,
+                        remove_columns=[
+                            c
+                            for c in eval_kl_dataset.column_names
+                            if c in eval_dataset.column_names
+                        ],
+                        desc="Processing tokenized eval KL dataset",
+                    )
+
+                    # merge the datasets
+                    eval_dataset = concatenate_datasets(
+                        [eval_dataset, eval_kl_dataset], axis=1
+                    )
+
+            # calculate dataset desirability balance
+            num_desirable = max(sum(train_dataset["label"]), 1)
+            num_undesirable = max(
+                len(train_dataset["label"]) - num_desirable, 1
+            )  # "label" is binary
+
+            if num_desirable != num_undesirable:
+                # The lower and upper bounds come from Eq. (8) of https://huggingface.co/papers/2402.01306
+                des_weight_lower_bound = round(
+                    (num_undesirable * self.undesirable_weight / num_desirable) * 1, 2
+                )
+                des_weight_upper_bound = round(
+                    (num_undesirable * self.undesirable_weight / num_desirable) * 1.33,
+                    2,
+                )
+                und_weight_lower_bound = round(
+                    (num_desirable * self.desirable_weight / num_undesirable) / 1.33, 2
+                )
+                und_weight_upper_bound = round(
+                    (num_desirable * self.desirable_weight / num_undesirable) / 1, 2
+                )
+
+                des_weight_in_range = (
+                    des_weight_lower_bound
+                    <= self.desirable_weight
+                    <= des_weight_upper_bound
+                )
+                und_weight_in_range = (
+                    und_weight_lower_bound
+                    <= self.undesirable_weight
+                    <= und_weight_upper_bound
+                )
+
+                if not (des_weight_in_range or und_weight_in_range):
+                    logger.warning(
+                        "You have different amounts of desirable/positive and undesirable/negative examples but the "
+                        "weights on the desirable and undesirable losses don't seem to be in an ideal range. Based "
+                        f"on your data, we recommend EITHER "
+                        f"desirable_weight in [{des_weight_lower_bound}, {des_weight_upper_bound}] or "
+                        f"undesirable_weight in [{und_weight_lower_bound}, {und_weight_upper_bound}] (but NOT BOTH). "
+                        "See the documentation on how to optimally set these weights.",
+                    )
+
+        super().__init__(
+            model=model,
+            args=args,
+            data_collator=data_collator,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            processing_class=processing_class,
+            model_init=model_init,
+            compute_metrics=compute_metrics,
+            callbacks=callbacks,
+            optimizers=optimizers,
+            preprocess_logits_for_metrics=preprocess_logits_for_metrics,
+        )
+
+        # Gradient accumulation requires scaled loss. Normally, loss scaling in the parent class depends on whether the
+        # model accepts loss-related kwargs. Since we compute our own loss, this check is irrelevant. We set
+        # self.model_accepts_loss_kwargs to False to enable scaling.
+        self.model_accepts_loss_kwargs = False
+
+        # Add tags for models that have been loaded with the correct transformers version
+        if hasattr(self.model, "add_model_tags"):
+            self.model.add_model_tags(self._tag_names)
+
+        if not hasattr(self, "accelerator"):
+            raise AttributeError(
+                "Your `Trainer` does not have an `accelerator` object. Consider upgrading `transformers`."
+            )
+
+        # Deepspeed Zero-3 does not support precompute_ref_log_probs
+        if self.is_deepspeed_enabled:
+            if (
+                self.accelerator.state.deepspeed_plugin.zero_stage == 3
+                and self.precompute_ref_log_probs
+            ):
+                raise ValueError(
+                    "You cannot use `precompute_ref_log_probs=True` with Deepspeed ZeRO-3. Please set `precompute_ref_log_probs=False`."
+                )
+
+        if self.ref_model is None:
+            if not (self.is_peft_model or self.precompute_ref_log_probs):
+                raise ValueError(
+                    "No reference model and model is not a Peft model. Try setting `precompute_ref_log_probs=True`"
+                )
+        elif self.is_deepspeed_enabled:
+            self.ref_model = prepare_deepspeed(self.ref_model, self.accelerator)
+        else:
+            self.ref_model = self.accelerator.prepare_model(
+                self.ref_model, evaluation_mode=True
+            )
+
+        # Import Liger kernel if enabled
+        if self.args.use_liger_kernel:
+            if not is_liger_kernel_available():
+                raise ImportError(
+                    "You set `use_liger_kernel=True` but the liger kernel is not available. "
+                    "Please install liger-kernel first: `pip install liger-kernel`"
+                )
+            if self.loss_type in ["apo_zero_unpaired"]:
+                raise ValueError(
+                    "You cannot set `loss_type='apo_zero_unpaired'` with liger-kernel."
+                    "Only KTO loss is supported with liger-kernel."
+                )
+            if self.precompute_ref_log_probs:
+                raise ValueError(
+                    "You cannot use `precompute_ref_log_probs=True` with liger kernel. Please set "
+                    "`precompute_ref_log_probs=False`."
+                )
+            if self.is_peft_model or self.ref_adapter_name is not None:
+                raise ValueError(
+                    "You cannot use `use_liger_kernel=True` with Peft models. Please set `use_liger_kernel=False`."
+                )
+            self.kto_loss_fn = LigerFusedLinearKTOLoss(
+                ignore_index=self.label_pad_token_id,
+                beta=self.beta,
+                use_ref_model=(self.ref_model is not None),
+            )
+
+    @contextmanager
+    def null_ref_context(self):
+        """Context manager for handling null reference model (that is, peft adapter manipulation)."""
+        with (
+            self.accelerator.unwrap_model(self.model).disable_adapter()
+            if self.is_peft_model and not self.ref_adapter_name
+            else nullcontext()
+        ):
+            if self.ref_adapter_name:
+                self.model.set_adapter(self.ref_adapter_name)
+            yield
+            if self.ref_adapter_name:
+                self.model.set_adapter(self.model_adapter_name or "default")
+
+    def get_train_dataloader(self) -> DataLoader:
+        """
+        Returns the training [`~torch.utils.data.DataLoader`].
+
+        Subclass of transformers.src.transformers.trainer.get_train_dataloader to precompute `ref_log_probs`.
+        """
+        if self.precompute_ref_log_probs and not self._precomputed_train_ref_log_probs:
+            dataloader_params = {
+                "batch_size": self.args.per_device_train_batch_size,
+                "collate_fn": self.data_collator,
+                "num_workers": self.args.dataloader_num_workers,
+                "pin_memory": self.args.dataloader_pin_memory,
+                "shuffle": False,
+            }
+
+            # prepare dataloader
+            data_loader = self.accelerator.prepare(
+                DataLoader(self.train_dataset, **dataloader_params)
+            )
+            reference_completion_logps = []
+            reference_KL_logps = []
+
+            for padded_batch in tqdm(
+                iterable=data_loader, desc="Train dataset reference log probs"
+            ):
+                reference_completion_logp, reference_KL_logp = (
+                    self.compute_reference_log_probs(padded_batch)
+                )
+
+                reference_completion_logp = self.accelerator.gather_for_metrics(
+                    reference_completion_logp
+                )
+                reference_completion_logps.append(reference_completion_logp.cpu())
+
+                if self.calculate_KL:
+                    reference_KL_logp = self.accelerator.gather_for_metrics(
+                        reference_KL_logp
+                    )
+                    reference_KL_logps.append(reference_KL_logp.cpu())
+
+            self.train_dataset = self.train_dataset.add_column(
+                name="reference_logps",
+                column=torch.cat(reference_completion_logps).float().numpy(),
+            )
+
+            if self.calculate_KL:
+                self.train_dataset = self.train_dataset.add_column(
+                    name="reference_KL_logps",
+                    column=torch.cat(reference_KL_logps).float().numpy(),
+                )
+
+            self._precomputed_train_ref_log_probs = True
+
+        return super().get_train_dataloader()
+
+    def get_eval_dataloader(self, eval_dataset: Dataset | None = None) -> DataLoader:
+        """
+        Returns the evaluation [`~torch.utils.data.DataLoader`].
+
+        Subclass of transformers.src.transformers.trainer.get_eval_dataloader to precompute `ref_log_probs`.
+
+        Args:
+            eval_dataset (`torch.utils.data.Dataset`, *optional*):
+                If provided, will override `self.eval_dataset`. If it is a [`~datasets.Dataset`], columns not accepted
+                by the `model.forward()` method are automatically removed. It must implement `__len__`.
+        """
+        if eval_dataset is None and self.eval_dataset is None:
+            raise ValueError("Trainer: evaluation requires an eval_dataset.")
+        eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset
+
+        if self.precompute_ref_log_probs and not self._precomputed_eval_ref_log_probs:
+            dataloader_params = {
+                "batch_size": self.args.per_device_eval_batch_size,
+                "collate_fn": self.data_collator,
+                "num_workers": self.args.dataloader_num_workers,
+                "pin_memory": self.args.dataloader_pin_memory,
+                "shuffle": False,
+            }
+
+            # prepare dataloader
+            data_loader = self.accelerator.prepare(
+                DataLoader(eval_dataset, **dataloader_params)
+            )
+
+            reference_completion_logps = []
+            reference_KL_logps = []
+
+            for padded_batch in tqdm(
+                iterable=data_loader, desc="Eval dataset reference log probs"
+            ):
+                reference_completion_logp, reference_KL_logp = (
+                    self.compute_reference_log_probs(padded_batch)
+                )
+
+                reference_completion_logp = self.accelerator.gather_for_metrics(
+                    reference_completion_logp
+                )
+                reference_completion_logps.append(reference_completion_logp.cpu())
+
+                if self.calculate_KL:
+                    reference_KL_logp = self.accelerator.gather_for_metrics(
+                        reference_KL_logp
+                    )
+                    reference_KL_logps.append(reference_KL_logp.cpu())
+
+            eval_dataset = eval_dataset.add_column(
+                name="reference_logps",
+                column=torch.cat(reference_completion_logps).float().numpy(),
+            )
+            if self.calculate_KL:
+                eval_dataset = eval_dataset.add_column(
+                    name="reference_KL_logps",
+                    column=torch.cat(reference_KL_logps).float().numpy(),
+                )
+
+            # Save calculated reference_chosen_logps and reference_rejected_logps to the eval_dataset for subsequent runs
+            if self.eval_dataset is not None:
+                self.eval_dataset = eval_dataset
+            self._precomputed_eval_ref_log_probs = True
+
+        return super().get_eval_dataloader(eval_dataset=eval_dataset)
+
+    def compute_reference_log_probs(self, padded_batch: dict) -> dict:
+        """Computes log probabilities of the reference model for a single padded batch of a KTO specific dataset."""
+        with torch.no_grad():
+            if self.ref_model is None:
+                with self.null_ref_context():
+                    if self.is_encoder_decoder:
+                        completion_logits = self.model(
+                            padded_batch["prompt_input_ids"],
+                            attention_mask=padded_batch["prompt_attention_mask"],
+                            decoder_input_ids=padded_batch.get(
+                                "completion_decoder_input_ids"
+                            ),
+                            labels=padded_batch["completion_labels"],
+                        ).logits
+
+                        if self.calculate_KL:
+                            KL_logits = self.model(
+                                padded_batch["KL_prompt_input_ids"],
+                                attention_mask=padded_batch["KL_prompt_attention_mask"],
+                                decoder_input_ids=padded_batch.get(
+                                    "KL_completion_decoder_input_ids"
+                                ),
+                                labels=padded_batch["KL_completion_labels"],
+                            ).logits
+                    else:
+                        completion_logits = self.model(
+                            padded_batch["completion_input_ids"],
+                            attention_mask=padded_batch["completion_attention_mask"],
+                        ).logits
+
+                        if self.calculate_KL:
+                            KL_logits = self.model(
+                                padded_batch["KL_completion_input_ids"],
+                                attention_mask=padded_batch[
+                                    "KL_completion_attention_mask"
+                                ],
+                            ).logits
+            elif self.is_encoder_decoder:
+                completion_logits = self.ref_model(
+                    padded_batch["prompt_input_ids"],
+                    attention_mask=padded_batch["prompt_attention_mask"],
+                    decoder_input_ids=padded_batch.get("completion_decoder_input_ids"),
+                    labels=padded_batch["completion_labels"],
+                ).logits
+
+                if self.calculate_KL:
+                    KL_logits = self.ref_model(
+                        padded_batch["KL_prompt_input_ids"],
+                        attention_mask=padded_batch["KL_prompt_attention_mask"],
+                        decoder_input_ids=padded_batch.get(
+                            "KL_completion_decoder_input_ids"
+                        ),
+                        labels=padded_batch["KL_completion_labels"],
+                    ).logits
+            else:
+                completion_logits = self.ref_model(
+                    padded_batch["completion_input_ids"],
+                    attention_mask=padded_batch["completion_attention_mask"],
+                ).logits
+
+                if self.calculate_KL:
+                    KL_logits = self.ref_model(
+                        padded_batch["KL_completion_input_ids"],
+                        attention_mask=padded_batch["KL_completion_attention_mask"],
+                    ).logits
+
+        completion_logps = self.get_batch_logps(
+            completion_logits,
+            padded_batch["completion_labels"],
+            average_log_prob=False,
+            is_encoder_decoder=self.is_encoder_decoder,
+            label_pad_token_id=self.label_pad_token_id,
+        )
+
+        if self.calculate_KL:
+            KL_logps = self.get_batch_logps(
+                KL_logits,
+                padded_batch["KL_completion_labels"],
+                average_log_prob=False,
+                is_encoder_decoder=self.is_encoder_decoder,
+                label_pad_token_id=self.label_pad_token_id,
+            )
+        else:
+            KL_logps = None
+
+        return completion_logps, KL_logps
+
+    @staticmethod
+    def get_batch_logps(
+        logits: torch.FloatTensor,
+        labels: torch.LongTensor,
+        average_log_prob: bool = False,
+        label_pad_token_id: int = -100,
+        is_encoder_decoder: bool = False,
+    ) -> torch.FloatTensor:
+        """Compute the log probabilities of the given labels under the given logits.
+
+        Args:
+            logits:
+                Logits of the model (unnormalized). Shape: (batch_size, sequence_length, vocab_size)
+            labels:
+                Labels for which to compute the log probabilities. Label tokens with a value of label_pad_token_id are
+                ignored. Shape: (batch_size, sequence_length)
+            average_log_prob:
+                If True, return the average log probability per (non-masked) token. Otherwise, return the sum of the
+                log probabilities of the (non-masked) tokens.
+            label_pad_token_id:
+                The label value to ignore when computing log probabilities.
+            is_encoder_decoder:
+                Whether the model is an encoder-decoder model. If True, the labels are not shifted and the logits are
+                assumed to already be aligned with the labels. If False, the labels are shifted to the right by one
+                position, and the logits are assumed to be aligned with the shifted labels.
+
+        Returns
+        -------
+            A tensor of shape (batch_size,) containing the average/sum log probabilities of the given labels under the
+            given logits.
+        """
+        if logits.shape[:-1] != labels.shape:
+            raise ValueError(
+                "Logits (batch and sequence length dim) and labels must have the same shape."
+            )
+
+        if not is_encoder_decoder:
+            labels = labels[:, 1:].clone()
+            logits = logits[:, :-1, :]
+        else:
+            # Fixes end-dec RuntimeError
+            labels = labels.clone()
+
+        loss_mask = labels != label_pad_token_id
+
+        # dummy token; we'll ignore the losses on these tokens later
+        labels[labels == label_pad_token_id] = 0
+
+        per_token_logps = selective_log_softmax(logits, labels)
+
+        if average_log_prob:
+            return (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1)
+        return (per_token_logps * loss_mask).sum(-1)
+
+    def forward(
+        self, model: nn.Module, batch: dict[str, list | torch.LongTensor]
+    ) -> tuple[
+        torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor
+    ]:
+        KL_logps = self._compute_kl_logps(model, batch)
+
+        model_kwargs = (
+            {
+                "labels": batch["completion_labels"],
+                "decoder_input_ids": batch.get("completion_decoder_input_ids"),
+            }
+            if self.is_encoder_decoder
+            else {}
+        )
+        if self.aux_loss_enabled:
+            model_kwargs["output_router_logits"] = True
+
+        outputs = model(
+            batch["completion_input_ids"],
+            attention_mask=batch["completion_attention_mask"],
+            **model_kwargs,
+        )
+        completion_logits = outputs.logits
+
+        completion_logps = self.get_batch_logps(
+            completion_logits,
+            batch["completion_labels"],
+            average_log_prob=False,
+            is_encoder_decoder=self.is_encoder_decoder,
+            label_pad_token_id=self.label_pad_token_id,
+        )
+
+        if completion_logps.shape[0] != len(batch["label"]):
+            raise ValueError(
+                "There is a mismatch between the number of examples in this batch and the number of "
+                "examples for which an output sequence was predicted."
+            )
+
+        chosen_idx = [
+            i for i in range(completion_logps.shape[0]) if batch["label"][i] is True
+        ]
+        rejected_idx = [
+            i for i in range(completion_logps.shape[0]) if batch["label"][i] is False
+        ]
+
+        chosen_logps = completion_logps[chosen_idx, ...]
+        rejected_logps = completion_logps[rejected_idx, ...]
+
+        chosen_logits = completion_logits[chosen_idx, ...]
+        rejected_logits = completion_logits[rejected_idx, ...]
+
+        if self.aux_loss_enabled:
+            return (
+                chosen_logps,
+                rejected_logps,
+                chosen_logits,
+                rejected_logits,
+                KL_logps,
+                outputs.aux_loss,
+            )
+        return (chosen_logps, rejected_logps, chosen_logits, rejected_logits, KL_logps)
+
+    def kto_loss(
+        self,
+        policy_chosen_logps: torch.FloatTensor,
+        policy_rejected_logps: torch.FloatTensor,
+        policy_KL_logps: torch.FloatTensor,
+        reference_chosen_logps: torch.FloatTensor,
+        reference_rejected_logps: torch.FloatTensor,
+        reference_KL_logps: torch.FloatTensor,
+    ) -> tuple[
+        torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor
+    ]:
+        """Compute the KTO loss for a batch of policy and reference model log probabilities.
+
+        Args:
+            policy_chosen_logps:
+                Log probabilities of the policy model for the chosen responses. Shape: (num(chosen) in batch_size,)
+            policy_rejected_logps:
+                Log probabilities of the policy model for the rejected responses. Shape: (num(rejected) in batch_size,)
+            policy_KL_logps: Log probabilities of the policy model for the KL responses. Shape: (batch_size,)
+            reference_chosen_logps:
+                Log probabilities of the reference model for the chosen responses. Shape: (num(chosen) in batch_size,)
+            reference_rejected_logps:
+                Log probabilities of the reference model for the rejected responses. Shape: (num(rejected) in
+                batch_size,)
+            reference_KL_logps: Log probabilities of the reference model for the KL responses. Shape: (batch_size,)
+
+        Returns
+        -------
+            A tuple of four tensors: (losses, chosen_rewards, rejected_rewards, KL). The losses tensor contains the KTO
+            loss for each example in the batch. The chosen_rewards and rejected_rewards tensors contain the rewards for
+            the chosen and rejected responses, respectively. The KL tensor contains the detached KL divergence estimate
+            between the policy and reference models.
+        """
+        if self.calculate_KL:
+            kl = (policy_KL_logps - reference_KL_logps).mean().detach()
+            kl = self.accelerator.gather_for_metrics(kl).mean().clamp(min=0)
+        else:
+            kl = torch.zeros(1).to(policy_chosen_logps.device)
+
+        # Chosen losses
+        if policy_chosen_logps.shape[0] != 0 or reference_chosen_logps.shape[0] != 0:
+            chosen_logratios = policy_chosen_logps - reference_chosen_logps
+
+            if self.loss_type == "kto":
+                # Eqn (7) of the KTO paper (https://huggingface.co/papers/2402.01306)
+                chosen_losses = 1 - F.sigmoid(self.beta * (chosen_logratios - kl))
+            elif self.loss_type == "apo_zero_unpaired":
+                # Unpaired variant of Eqn (7) of the APO paper (https://huggingface.co/papers/2408.06266)
+                # Use this loss when you believe the chosen outputs are better than your model's default output
+                chosen_losses = 1 - F.sigmoid(self.beta * chosen_logratios)
+
+            chosen_rewards = self.beta * chosen_logratios.detach()
+
+        else:
+            # lists can't be empty -- if they are, then accelerate.gather will hang
+            chosen_losses = torch.Tensor([]).to(self.accelerator.device)
+            chosen_rewards = torch.Tensor([]).to(self.accelerator.device)
+
+        # Rejected losses
+        if (
+            policy_rejected_logps.shape[0] != 0
+            or reference_rejected_logps.shape[0] != 0
+        ):
+            rejected_logratios = policy_rejected_logps - reference_rejected_logps
+
+            if self.loss_type == "kto":
+                rejected_losses = 1 - F.sigmoid(self.beta * (kl - rejected_logratios))
+            elif self.loss_type == "apo_zero_unpaired":
+                rejected_losses = F.sigmoid(self.beta * rejected_logratios)
+
+            rejected_rewards = self.beta * rejected_logratios.detach()
+        else:
+            # lists can't be empty -- if they are, then accelerate.gather will hang
+            rejected_losses = torch.Tensor([]).to(self.accelerator.device)
+            rejected_rewards = torch.Tensor([]).to(self.accelerator.device)
+
+        losses = torch.cat(
+            (
+                self.desirable_weight * chosen_losses,
+                self.undesirable_weight * rejected_losses,
+            ),
+            0,
+        )
+
+        return losses, chosen_rewards, rejected_rewards, kl
+
+    def _compute_kl_logps(self, model, batch):
+        """Compute KL log probabilities for a given batch."""
+        KL_logps = None
+        if self.calculate_KL:
+            if self.is_encoder_decoder:
+                KL_model_kwargs = {
+                    "input_ids": batch["KL_prompt_input_ids"],
+                    "attention_mask": batch["KL_prompt_attention_mask"],
+                    "labels": batch["KL_completion_labels"],
+                    "decoder_input_ids": batch.get("KL_completion_decoder_input_ids"),
+                }
+            else:
+                KL_model_kwargs = {
+                    "input_ids": batch["KL_completion_input_ids"],
+                    "attention_mask": batch["KL_completion_attention_mask"],
+                }
+
+            with torch.no_grad():
+                KL_logits = model(**KL_model_kwargs).logits
+
+            KL_logps = self.get_batch_logps(
+                KL_logits,
+                batch["KL_completion_labels"],
+                average_log_prob=False,
+                is_encoder_decoder=self.is_encoder_decoder,
+                label_pad_token_id=self.label_pad_token_id,
+            )
+        return KL_logps
+
+    def _compute_loss_liger(self, model, batch):
+        """
+        Compute the KTO loss using the Liger-Kernel's LigerFusedLinearKTOLoss.
+
+        Args:
+            model:
+                The policy model used for generating log probabilities and outputs. It could be an encoder-decoder
+                model or a regular language model.
+            batch: A dictionary containing the input data and labels for the batch.
+
+        Returns
+        -------
+            A dictionary containing the following keys:
+                - "loss": The computed KTO loss for the batch.
+                - "chosen_logits_sum": Sum of the logits for the chosen responses from the policy model.
+                - "rejected_logits_sum": Sum of the logits for the rejected responses from the policy model.
+                - "chosen_logps": Log probabilities of the chosen responses from the policy model.
+                - "rejected_logps": Log probabilities of the rejected responses from the policy model.
+                - "chosen_rewards": Rewards for the chosen responses.
+                - "rejected_rewards": Rewards for the rejected responses.
+                - "kl": The KL divergence between the policy and reference models (detached).
+
+            If auxiliary loss is enabled, the dictionary will also include:
+                - "aux_loss": The auxiliary loss from the model outputs.
+        """
+        policy_KL_logps = self._compute_kl_logps(model, batch)
+        reference_KL_logps = self._compute_kl_logps(self.ref_model, batch)
+        if self.calculate_KL:
+            kl = (policy_KL_logps - reference_KL_logps).mean().detach()
+            kl = self.accelerator.gather_for_metrics(kl).mean().clamp(min=0)
+        else:
+            kl = torch.zeros(1).to(self.accelerator.device)
+
+        model_kwargs = (
+            {
+                "labels": batch["completion_labels"],
+                "decoder_input_ids": batch.get("completion_decoder_input_ids"),
+            }
+            if self.is_encoder_decoder
+            else {}
+        )
+        if self.aux_loss_enabled:
+            model_kwargs["output_router_logits"] = True
+
+        if self.is_encoder_decoder:
+            # 1. Get encoder outputs
+            encoder_outputs = model.get_encoder()(
+                batch["completion_input_ids"],
+                attention_mask=batch["completion_attention_mask"],
+                return_dict=True,
+                **model_kwargs,
+            )
+            # 2. Get decoder outputs
+            outputs = model.get_decoder()(
+                input_ids=model_kwargs["decoder_input_ids"],
+                encoder_hidden_states=encoder_outputs.last_hidden_state,
+                use_cache=False,
+                **model_kwargs,
+            )
+            # 1. Get reference encoder outputs
+            ref_encoder_outputs = self.ref_model.get_encoder()(
+                batch["completion_input_ids"],
+                attention_mask=batch["completion_attention_mask"],
+                return_dict=True,
+                **model_kwargs,
+            )
+            # 2. Get reference decoder outputs
+            ref_outputs = self.ref_model.get_decoder()(
+                input_ids=model_kwargs["decoder_input_ids"],
+                encoder_hidden_states=ref_encoder_outputs.last_hidden_state,
+                use_cache=False,
+                **model_kwargs,
+            )
+        else:
+            # skip the lm head and get the last hidden state
+            if hasattr(model, "get_decoder") and model.get_decoder() is not None:
+                base_model = model.get_decoder()
+            else:
+                base_attr = getattr(
+                    model, "base_model_prefix", self.args.base_model_attribute_name
+                )
+                base_model = getattr(model, base_attr, model)
+            outputs = base_model(
+                batch["completion_input_ids"],
+                attention_mask=batch["completion_attention_mask"],
+                use_cache=False,
+                **model_kwargs,
+            )
+
+            # reference model
+            if (
+                hasattr(self.ref_model, "get_decoder")
+                and self.ref_model.get_decoder() is not None
+            ):
+                ref_base_model = self.ref_model.get_decoder()
+            else:
+                ref_attr = getattr(
+                    self.ref_model,
+                    "base_model_prefix",
+                    self.args.base_model_attribute_name,
+                )
+                ref_base_model = getattr(self.ref_model, ref_attr, self.ref_model)
+            ref_outputs = ref_base_model(
+                batch["completion_input_ids"],
+                attention_mask=batch["completion_attention_mask"],
+                use_cache=False,
+                **model_kwargs,
+            )
+        lm_head = model.get_output_embeddings()
+        ref_lm_head = self.ref_model.get_output_embeddings()
+
+        (
+            loss,
+            (
+                chosen_logps_sum,
+                rejected_logps_sum,
+                chosen_logits_sum,
+                rejected_logits_sum,
+                chosen_rewards_sum,
+                rejected_rewards_sum,
+            ),
+        ) = self.kto_loss_fn(
+            _input=outputs.last_hidden_state[:, :-1]
+            if not self.is_encoder_decoder
+            else outputs.last_hidden_state,
+            lin_weight=lm_head.weight,
+            target=batch["completion_labels"][:, 1:],
+            bias=lm_head.bias if hasattr(lm_head, "bias") else None,
+            preference_labels=torch.tensor(batch["label"], dtype=torch.bool).to(
+                self.accelerator.device
+            ),
+            ref_input=ref_outputs.last_hidden_state[:, :-1]
+            if not self.is_encoder_decoder
+            else outputs.last_hidden_state,
+            ref_weight=ref_lm_head.weight,
+            ref_bias=ref_lm_head.bias if hasattr(lm_head, "bias") else None,
+            kl=kl,
+        )
+
+        output = {
+            "loss": loss,
+            "chosen_logits_sum": chosen_logits_sum,
+            "rejected_logits_sum": rejected_logits_sum,
+            "chosen_logps_sum": chosen_logps_sum,
+            "rejected_logps_sum": rejected_logps_sum,
+            "chosen_rewards_sum": chosen_rewards_sum,
+            "rejected_rewards_sum": rejected_rewards_sum,
+            "kl": kl,
+        }
+        if self.aux_loss_enabled:
+            output["aux_loss"] = outputs.aux_loss
+
+        return output
+
+    def get_batch_loss_metrics(
+        self,
+        model,
+        batch: dict[str, list | torch.LongTensor],
+    ):
+        """Compute the KTO loss and other metrics for the given batch of inputs for train or test."""
+        metrics = {}
+        batch = {
+            k: (v.to(self.accelerator.device) if isinstance(v, torch.Tensor) else v)
+            for k, v in batch.items()
+        }
+
+        labels = torch.tensor(batch["label"])
+        num_chosen = labels.sum().to(self.accelerator.device)
+        num_rejected = (len(labels) - num_chosen).to(self.accelerator.device)
+
+        if self.args.use_liger_kernel:
+            model_output = self._compute_loss_liger(model, batch)
+            losses = model_output["loss"]
+            policy_chosen_logits = model_output["chosen_logits_sum"]
+            policy_rejected_logits = model_output["rejected_logits_sum"]
+            policy_chosen_logps = model_output["chosen_logps_sum"]
+            policy_rejected_logps = model_output["rejected_logps_sum"]
+            chosen_rewards = model_output["chosen_rewards_sum"]
+            rejected_rewards = model_output["rejected_rewards_sum"]
+            kl = model_output["kl"]
+            if self.aux_loss_enabled:
+                aux_loss = model_output["aux_loss"]
+        else:
+            forward_output = self.forward(model, batch)
+            (
+                policy_chosen_logps,
+                policy_rejected_logps,
+                policy_chosen_logits,
+                policy_rejected_logits,
+                policy_KL_logps,
+            ) = forward_output[:5]
+            if self.aux_loss_enabled:
+                aux_loss = forward_output[5]
+
+            # if reference_logps in batch use them, otherwise use the reference model
+            if "reference_logps" in batch:
+                chosen_idx = [
+                    i
+                    for i in range(batch["reference_logps"].shape[0])
+                    if batch["label"][i] is True
+                ]
+                rejected_idx = [
+                    i
+                    for i in range(batch["reference_logps"].shape[0])
+                    if batch["label"][i] is False
+                ]
+
+                reference_chosen_logps = batch["reference_logps"][chosen_idx, ...]
+                reference_rejected_logps = batch["reference_logps"][rejected_idx, ...]
+                if self.calculate_KL:
+                    reference_KL_logps = batch["reference_KL_logps"]
+                else:
+                    reference_KL_logps = None
+            else:
+                with torch.no_grad():
+                    if self.ref_model is None:
+                        with self.null_ref_context():
+                            (
+                                reference_chosen_logps,
+                                reference_rejected_logps,
+                                _,
+                                _,
+                                reference_KL_logps,
+                            ) = self.forward(self.model, batch)[:5]
+                    else:
+                        (
+                            reference_chosen_logps,
+                            reference_rejected_logps,
+                            _,
+                            _,
+                            reference_KL_logps,
+                        ) = self.forward(self.ref_model, batch)[:5]
+
+            losses, chosen_rewards, rejected_rewards, kl = self.kto_loss(
+                policy_chosen_logps,
+                policy_rejected_logps,
+                policy_KL_logps,
+                reference_chosen_logps,
+                reference_rejected_logps,
+                reference_KL_logps,
+            )
+
+        metrics["kl"] = kl.item()
+
+        all_num_chosen = self.accelerator.gather_for_metrics(num_chosen).sum().item()
+        all_num_rejected = (
+            self.accelerator.gather_for_metrics(num_rejected).sum().item()
+        )
+
+        if all_num_chosen > 0:
+            metrics["rewards/chosen_sum"] = (
+                self.accelerator.gather_for_metrics(chosen_rewards.nansum())
+                .nansum()
+                .item()
+            )
+            metrics["logps/chosen_sum"] = (
+                self.accelerator.gather_for_metrics(policy_chosen_logps.nansum())
+                .nansum()
+                .item()
+            )
+            metrics["logits/chosen_sum"] = (
+                self.accelerator.gather_for_metrics(policy_chosen_logits.nansum())
+                .nansum()
+                .item()
+            )
+            metrics["count/chosen"] = all_num_chosen
+
+        if all_num_rejected > 0:
+            metrics["rewards/rejected_sum"] = (
+                self.accelerator.gather_for_metrics(rejected_rewards.nansum())
+                .nansum()
+                .item()
+            )
+            metrics["logps/rejected_sum"] = (
+                self.accelerator.gather_for_metrics(policy_rejected_logps.nansum())
+                .nansum()
+                .item()
+            )
+            metrics["logits/rejected_sum"] = (
+                self.accelerator.gather_for_metrics(policy_rejected_logits.nansum())
+                .nansum()
+                .item()
+            )
+            metrics["count/rejected"] = all_num_rejected
+
+        loss = losses.nanmean()
+        if self.aux_loss_enabled:
+            loss += self.aux_loss_coef * aux_loss
+
+        return loss, metrics
+
+    def compute_loss(
+        self,
+        model: PreTrainedModel | nn.Module,
+        inputs: dict[str, torch.Tensor | Any],
+        return_outputs=False,
+        num_items_in_batch=None,
+    ) -> torch.Tensor | tuple[torch.Tensor, dict[str, torch.Tensor]]:
+        compute_loss_context_manager = (
+            autocast(self.accelerator.device.type)
+            if self._peft_has_been_casted_to_bf16
+            else nullcontext()
+        )
+
+        with compute_loss_context_manager:
+            loss, metrics = self.get_batch_loss_metrics(model, inputs)
+
+        # Make sure to move the loss to the device the original accumulating loss is at back in the `Trainer` class:
+        loss = loss.to(self.args.device)
+        # force log the metrics
+        if self.accelerator.is_main_process:
+            self.store_metrics(metrics, train_eval="train")
+
+        if return_outputs:
+            return (loss, metrics)
+        return loss
+
+    def store_metrics(
+        self, metrics: dict[str, float], train_eval: Literal["train", "eval"] = "train"
+    ) -> None:
+        for key, value in metrics.items():
+            self._stored_metrics[train_eval][key].append(value)
+
+    def _get_train_sampler(
+        self, dataset: Dataset | None = None
+    ) -> torch.utils.data.Sampler | None:
+        if dataset is None:
+            dataset = self.train_dataset
+        if dataset is None or not has_length(dataset):
+            return None
+        return SequentialSampler(dataset)
+
+    def generate_from_model_and_ref(
+        self, model, batch: dict[str, torch.LongTensor]
+    ) -> tuple[str, str]:
+        """Generate samples from the model and reference model for the given batch of inputs."""
+        # If one uses `generate_during_eval` with peft + bf16, we need to explicitly call generate with
+        # the torch amp context manager as some hidden states are silently casted to full precision.
+        generate_context_manager = (
+            autocast(self.accelerator.device.type)
+            if self._peft_has_been_casted_to_bf16
+            else nullcontext()
+        )
+
+        with generate_context_manager:
+            policy_output = model.generate(
+                input_ids=batch["prompt_input_ids"],
+                attention_mask=batch["prompt_attention_mask"],
+                max_length=self.max_length,
+                do_sample=True,
+                pad_token_id=self.processing_class.pad_token_id,
+            )
+
+            # if reference_output in batch use that otherwise use the reference model
+            if "reference_output" in batch:
+                reference_output = batch["reference_output"]
+            elif self.ref_model is None:
+                with self.null_ref_context():
+                    reference_output = self.model.generate(
+                        input_ids=batch["prompt_input_ids"],
+                        attention_mask=batch["prompt_attention_mask"],
+                        max_length=self.max_length,
+                        do_sample=True,
+                        pad_token_id=self.processing_class.pad_token_id,
+                    )
+            else:
+                reference_output = self.ref_model.generate(
+                    input_ids=batch["prompt_input_ids"],
+                    attention_mask=batch["prompt_attention_mask"],
+                    max_length=self.max_length,
+                    do_sample=True,
+                    pad_token_id=self.processing_class.pad_token_id,
+                )
+
+        policy_output = pad_to_length(
+            policy_output, self.max_length, self.processing_class.pad_token_id
+        )
+        policy_output_decoded = self.processing_class.batch_decode(
+            policy_output, skip_special_tokens=True
+        )
+
+        reference_output = pad_to_length(
+            reference_output, self.max_length, self.processing_class.pad_token_id
+        )
+        reference_output_decoded = self.processing_class.batch_decode(
+            reference_output, skip_special_tokens=True
+        )
+
+        return policy_output_decoded, reference_output_decoded
+
+    def prediction_step(
+        self,
+        model: PreTrainedModel | nn.Module,
+        inputs: dict[str, torch.Tensor | Any],
+        prediction_loss_only: bool,
+        ignore_keys: list[str] | None = None,
+    ):
+        if ignore_keys is None:
+            if hasattr(model, "config"):
+                ignore_keys = getattr(model.config, "keys_to_ignore_at_inference", [])
+            else:
+                ignore_keys = []
+
+        prediction_context_manager = (
+            autocast(self.accelerator.device.type)
+            if self._peft_has_been_casted_to_bf16
+            else nullcontext()
+        )
+        with torch.no_grad(), prediction_context_manager:
+            loss, metrics = self.get_batch_loss_metrics(model, inputs)
+
+        # force log the metrics
+        if self.accelerator.is_main_process:
+            self.store_metrics(metrics, train_eval="eval")
+
+        if prediction_loss_only:
+            return (loss.detach(), None, None)
+
+        # logits for the chosen and rejected samples from model
+        logits_dict = {}
+        if "logits/chosen_sum" in metrics:
+            logits_dict["eval_logits/chosen"] = metrics["logits/chosen_sum"]
+        if "logits/rejected_sum" in metrics:
+            logits_dict["eval_logits/rejected"] = metrics["logits/rejected_sum"]
+        logits = [v for k, v in logits_dict.items() if k not in ignore_keys]
+        logits = torch.tensor(logits, device=self.accelerator.device)
+        labels = torch.zeros(logits.shape[0], device=self.accelerator.device)
+
+        return (loss.detach(), logits, labels)
+
+    def evaluation_loop(
+        self,
+        dataloader: DataLoader,
+        description: str,
+        prediction_loss_only: bool | None = None,
+        ignore_keys: list[str] | None = None,
+        metric_key_prefix: str = "eval",
+    ) -> EvalLoopOutput:
+        """
+        Overriding built-in evaluation loop to store metrics for each batch. Prediction/evaluation loop, shared by
+        `Trainer.evaluate()` and `Trainer.predict()`.
+
+        Works both with or without labels.
+        """
+        # Sample and save to game log if requested (for one batch to save time)
+        if self.generate_during_eval:
+            # Generate random indices within the range of the total number of samples
+            num_samples = len(dataloader.dataset)
+            random_indices = random.sample(
+                range(num_samples), k=self.args.eval_batch_size
+            )
+
+            # Use dataloader.dataset.select to get the random batch without iterating over the DataLoader
+            random_batch_dataset = dataloader.dataset.select(random_indices)
+            random_batch = self.data_collator(random_batch_dataset)
+            random_batch = self._prepare_inputs(random_batch)
+
+            target_labels = torch.tensor(
+                random_batch["label"], dtype=torch.bool, device=self.accelerator.device
+            )
+            target_indices = torch.where(~target_labels)[0]
+            target_batch = {
+                "prompt_input_ids": random_batch["prompt_input_ids"][target_indices],
+                "prompt_attention_mask": random_batch["prompt_attention_mask"][
+                    target_indices
+                ],
+                "prompt": itemgetter(*target_indices)(random_batch["prompt"]),
+            }
+            policy_output_decoded, ref_output_decoded = (
+                self.generate_from_model_and_ref(self.model, target_batch)
+            )
+
+            table = pd.DataFrame(
+                columns=["Prompt", "Policy", "Ref Model"],
+                data=[
+                    [prompt, pol[len(prompt) :], ref[len(prompt) :]]
+                    for prompt, pol, ref in zip(
+                        target_batch["prompt"],
+                        policy_output_decoded,
+                        ref_output_decoded,
+                        strict=True,
+                    )
+                ],
+            )
+            if "wandb" in self.args.report_to:
+                wandb.log({"game_log": wandb.Table(data=table)})
+
+            if "comet_ml" in self.args.report_to:
+                log_table_to_comet_experiment(
+                    name="game_log.csv",
+                    table=table,
+                )
+
+        # Base evaluation
+        initial_output = super().evaluation_loop(
+            dataloader,
+            description,
+            prediction_loss_only,
+            ignore_keys,
+            metric_key_prefix,
+        )
+
+        return initial_output
+
+    def log(self, logs: dict[str, float], start_time: float | None = None) -> None:
+        """
+        Log `logs` on the various objects watching training, including stored metrics.
+
+        Args:
+            logs (`dict[str, float]`):
+                The values to log.
+            start_time (`float`, *optional*):
+                Start time of the training.
+        """
+        # logs either has 'loss' or 'eval_loss'
+        train_eval = "train" if "loss" in logs else "eval"
+        # train metrics should have no prefix, eval should have 'eval_'
+        prefix = "eval_" if train_eval == "eval" else ""
+        # accumulate average metrics from sums and lengths
+        for split in ["chosen", "rejected"]:
+            if f"count/{split}" in self._stored_metrics[train_eval]:
+                count_sum = (
+                    torch.Tensor(self._stored_metrics[train_eval][f"count/{split}"])
+                    .sum()
+                    .item()
+                )
+                for metric in ["rewards", "logps", "logits"]:
+                    logs[f"{prefix}{metric}/{split}"] = (
+                        torch.Tensor(
+                            self._stored_metrics[train_eval][f"{metric}/{split}_sum"]
+                        )
+                        .sum()
+                        .item()
+                        / count_sum
+                    )
+                    # delete obsolete metric
+                    del self._stored_metrics[train_eval][f"{metric}/{split}_sum"]
+                del self._stored_metrics[train_eval][f"count/{split}"]
+        # calculate reward margin
+        if f"{prefix}rewards/chosen" in logs and f"{prefix}rewards/rejected" in logs:
+            logs[f"{prefix}rewards/margins"] = (
+                logs[f"{prefix}rewards/chosen"] - logs[f"{prefix}rewards/rejected"]
+            )
+        # Add averaged stored metrics to logs
+        for key, metrics in self._stored_metrics[train_eval].items():
+            logs[f"{prefix}{key}"] = torch.Tensor(metrics).mean().item()
+        del self._stored_metrics[train_eval]
+        return super().log(logs, start_time)
+
+    # Ensure the model card is saved along with the checkpoint
+    def _save_checkpoint(self, model, trial):
+        if self.args.hub_model_id is None:
+            model_name = Path(self.args.output_dir).name
+        else:
+            model_name = self.args.hub_model_id.split("/")[-1]
+        self.create_model_card(model_name=model_name)
+        super()._save_checkpoint(model, trial)
diff --git a/src/aixpert/training/training/trl/trainer/model_config.py b/src/aixpert/training/training/trl/trainer/model_config.py
new file mode 100644
index 0000000..4dfe665
--- /dev/null
+++ b/src/aixpert/training/training/trl/trainer/model_config.py
@@ -0,0 +1,212 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+from dataclasses import dataclass, field
+
+
+@dataclass
+class ModelConfig:
+    """
+    Configuration class for the models.
+
+    Using [`~transformers.HfArgumentParser`] we can turn this class into
+    [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
+    command line.
+
+    Parameters
+    ----------
+        model_name_or_path (`str`, *optional*):
+            Model checkpoint for weights initialization.
+        model_revision (`str`, *optional*, defaults to `"main"`):
+            Specific model version to use. It can be a branch name, a tag name, or a commit id.
+        dtype (`Literal["auto", "bfloat16", "float16", "float32"]`, *optional*):
+            Override the default `torch.dtype` and load the model under this dtype. Possible values are
+
+                - `"bfloat16"`: `torch.bfloat16`
+                - `"float16"`: `torch.float16`
+                - `"float32"`: `torch.float32`
+                - `"auto"`: Automatically derive the dtype from the model's weights.
+
+        trust_remote_code (`bool`, *optional*, defaults to `False`):
+            Whether to allow for custom models defined on the Hub in their own modeling files. This option should only
+            be set to `True` for repositories you trust and in which you have read the code, as it will execute code
+            present on the Hub on your local machine.
+        attn_implementation (`str`, *optional*):
+            Which attention implementation to use. You can run `--attn_implementation=flash_attention_2`, in which case
+            you must install this manually by running `pip install flash-attn --no-build-isolation`.
+        use_peft (`bool`, *optional*, defaults to `False`):
+            Whether to use PEFT for training.
+        lora_r (`int`, *optional*, defaults to `16`):
+            LoRA R value.
+        lora_alpha (`int`, *optional*, defaults to `32`):
+            LoRA alpha.
+        lora_dropout (`float`, *optional*, defaults to `0.05`):
+            LoRA dropout.
+        lora_target_modules (`str | list[str]`, *optional*):
+            LoRA target modules.
+        lora_target_parameters (`str | list[str]`, *optional*):
+            List of target parameters for LoRA.
+        lora_modules_to_save (`list[str]`, *optional*):
+            Model layers to unfreeze & train.
+        lora_task_type (`str`, *optional*, defaults to `"CAUSAL_LM"`):
+            Task type to pass for LoRA (use `"SEQ_CLS"` for reward modeling).
+        use_rslora (`bool`, *optional*, defaults to `False`):
+            Whether to use Rank-Stabilized LoRA, which sets the adapter scaling factor to `lora_alpha/√r`, instead of
+            the original default value of `lora_alpha/r`.
+        use_dora (`bool`, *optional*, defaults to `False`):
+            Enable [Weight-Decomposed Low-Rank Adaptation (DoRA)](https://huggingface.co/papers/2402.09353). This
+            technique decomposes the updates of the weights into two parts, magnitude and direction. Direction is
+            handled by normal LoRA, whereas the magnitude is handled by a separate learnable parameter. This can
+            improve the performance of LoRA, especially at low ranks. Right now, DoRA only supports linear and Conv2D
+            layers. DoRA introduces a bigger overhead than pure LoRA, so it is recommended to merge weights for
+            inference.
+        load_in_8bit (`bool`, *optional*, defaults to `False`):
+            Whether to use 8 bit precision for the base model. Works only with LoRA.
+        load_in_4bit (`bool`, *optional*, defaults to `False`):
+            Whether to use 4 bit precision for the base model. Works only with LoRA.
+        bnb_4bit_quant_type (`str`, *optional*, defaults to `"nf4"`):
+            Quantization type (`"fp4"` or `"nf4"`).
+        use_bnb_nested_quant (`bool`, *optional*, defaults to `False`):
+            Whether to use nested quantization.
+    """
+
+    model_name_or_path: str | None = field(
+        default=None,
+        metadata={"help": "Model checkpoint for weights initialization."},
+    )
+    model_revision: str = field(
+        default="main",
+        metadata={
+            "help": "Specific model version to use. It can be a branch name, a tag name, or a commit id."
+        },
+    )
+    dtype: str | None = field(
+        default=None,
+        metadata={
+            "help": "Override the default `torch.dtype` and load the model under this dtype.",
+            "choices": ["auto", "bfloat16", "float16", "float32"],
+        },
+    )
+    trust_remote_code: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to allow for custom models defined on the Hub in their own modeling files. This option "
+            "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
+            "execute code present on the Hub on your local machine."
+        },
+    )
+    attn_implementation: str | None = field(
+        default=None,
+        metadata={
+            "help": "Which attention implementation to use. You can run `--attn_implementation=flash_attention_2`, in "
+            "which case you must install this manually by running `pip install flash-attn --no-build-isolation`."
+        },
+    )
+    use_peft: bool = field(
+        default=False,
+        metadata={"help": "Whether to use PEFT for training."},
+    )
+    lora_r: int = field(
+        default=16,
+        metadata={"help": "LoRA R value."},
+    )
+    lora_alpha: int = field(
+        default=32,
+        metadata={"help": "LoRA alpha."},
+    )
+    lora_dropout: float = field(
+        default=0.05,
+        metadata={"help": "LoRA dropout."},
+    )
+    lora_target_modules: list[str] | None = field(
+        default=None,
+        metadata={"help": "LoRA target modules."},
+    )
+    lora_target_parameters: list[str] | None = field(
+        default=None,
+        metadata={"help": "List of target parameters for LoRA."},
+    )
+    lora_modules_to_save: list[str] | None = field(
+        default=None,
+        metadata={"help": "Model layers to unfreeze & train."},
+    )
+    lora_task_type: str = field(
+        default="CAUSAL_LM",
+        metadata={
+            "help": "Task type to pass for LoRA (use 'SEQ_CLS' for reward modeling)."
+        },
+    )
+    use_rslora: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to use Rank-Stabilized LoRA, which sets the adapter scaling factor to `lora_alpha/√r`, "
+            "instead of the original default value of `lora_alpha/r`."
+        },
+    )
+    use_dora: bool = field(
+        default=False,
+        metadata={
+            "help": "Enable Weight-Decomposed Low-Rank Adaptation (DoRA). This technique decomposes the updates of "
+            "the weights into two parts, magnitude and direction. Direction is handled by normal LoRA, whereas the "
+            "magnitude is handled by a separate learnable parameter. This can improve the performance of LoRA, "
+            "especially at low ranks. Right now, DoRA only supports linear and Conv2D layers. DoRA introduces a "
+            "bigger overhead than pure LoRA, so it is recommended to merge weights for inference."
+        },
+    )
+    load_in_8bit: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to use 8 bit precision for the base model. Works only with LoRA."
+        },
+    )
+    load_in_4bit: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to use 4 bit precision for the base model. Works only with LoRA."
+        },
+    )
+    bnb_4bit_quant_type: str = field(
+        default="nf4",
+        metadata={"help": "Quantization type.", "choices": ["fp4", "nf4"]},
+    )
+    use_bnb_nested_quant: bool = field(
+        default=False,
+        metadata={"help": "Whether to use nested quantization."},
+    )
+    # Deprecated params
+    torch_dtype: str | None = field(
+        default=None,
+        metadata={
+            "help": "Override the default `torch.dtype` and load the model under this dtype.",
+            "choices": ["auto", "bfloat16", "float16", "float32"],
+        },
+    )
+
+    def __post_init__(self):
+        if self.load_in_8bit and self.load_in_4bit:
+            raise ValueError("You can't use 8 bit and 4 bit precision at the same time")
+
+        if self.torch_dtype and not self.dtype:
+            warnings.warn(
+                "`torch_dtype` is deprecated and will be removed in version 0.27.0, please use `dtype` instead.",
+                FutureWarning,
+            )
+            self.dtype = self.torch_dtype
+
+        if (
+            hasattr(self.lora_target_modules, "__len__")
+            and len(self.lora_target_modules) == 1
+        ):
+            self.lora_target_modules = self.lora_target_modules[0]
diff --git a/src/aixpert/training/training/trl/trainer/nash_md_config.py b/src/aixpert/training/training/trl/trainer/nash_md_config.py
new file mode 100644
index 0000000..a46463c
--- /dev/null
+++ b/src/aixpert/training/training/trl/trainer/nash_md_config.py
@@ -0,0 +1,47 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass, field
+
+from .online_dpo_config import OnlineDPOConfig
+
+
+@dataclass
+class NashMDConfig(OnlineDPOConfig):
+    r"""
+    Configuration class for the [`NashMDTrainer`].
+
+    Subclass of [`OnlineDPOConfig`] we can use all its arguments and add the following:
+
+    Parameters
+    ----------
+        mixture_coef (`float` or `list[float]`, *optional*, defaults to `0.5`):
+            Logit mixture coefficient for the model and reference model. If a list of floats is provided then the
+            mixture coefficient is selected for each new epoch and the last coefficient is used for the rest of the
+            epochs.
+    """
+
+    mixture_coef: list[float] = field(
+        default_factory=lambda: [0.5],
+        metadata={
+            "help": "Logit mixture coefficient for the model and reference model. If a list of floats is provided "
+            "then the mixture coefficient is selected for each new epoch and the last coefficient is used for the "
+            "rest of the epochs."
+        },
+    )
+
+    def __post_init__(self):
+        super().__post_init__()
+        if hasattr(self.mixture_coef, "__len__") and len(self.mixture_coef) == 1:
+            self.mixture_coef = self.mixture_coef[0]
diff --git a/src/aixpert/training/training/trl/trainer/nash_md_trainer.py b/src/aixpert/training/training/trl/trainer/nash_md_trainer.py
new file mode 100644
index 0000000..d93c069
--- /dev/null
+++ b/src/aixpert/training/training/trl/trainer/nash_md_trainer.py
@@ -0,0 +1,569 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import textwrap
+from collections.abc import Callable
+from typing import Any
+
+import jinja2
+import torch
+import torch.nn.functional as F
+from datasets import Dataset, IterableDataset
+from torch import nn
+from transformers import (
+    BaseImageProcessor,
+    FeatureExtractionMixin,
+    PreTrainedModel,
+    PreTrainedTokenizerBase,
+    ProcessorMixin,
+    TrainerCallback,
+)
+from transformers.trainer_utils import EvalPrediction
+from transformers.training_args import OptimizerNames
+from transformers.utils import is_peft_available
+
+from ..data_utils import is_conversational, maybe_apply_chat_template
+from ..models.modeling_base import GeometricMixtureWrapper
+from ..models.utils import unwrap_model_for_generation
+from .nash_md_config import NashMDConfig
+from .online_dpo_trainer import OnlineDPOTrainer
+from .utils import (
+    SIMPLE_CHAT_TEMPLATE,
+    empty_cache,
+    get_reward,
+    selective_log_softmax,
+    truncate_right,
+)
+
+
+if is_peft_available():
+    from peft import PeftModel
+
+
+class NashMDTrainer(OnlineDPOTrainer):
+    """
+    Trainer for the Nash-MD method.
+
+    It is implemented as a subclass of [`OnlineDPOTrainer`].
+
+    Args:
+        model ([`~transformers.PreTrainedModel`]):
+            The model to train, preferably an `AutoModelForCausalLM`.
+        ref_model ([`PreTrainedModelWrapper`]):
+            Hugging Face transformer model with a casual language modelling head. Used for implicit reward computation
+            and loss. If no reference model is provided, the trainer will create a reference model with the same
+            architecture as the model to be optimized.
+        reward_funcs ([`~transformers.PreTrainedModel`]):
+            The reward model to score completions with, preferably an
+            [`~transformers.AutoModelForSequenceClassification`].
+        judge ([`experimental.judges.BasePairwiseJudge`]):
+            The judge to use for pairwise comparison of model completions.
+        args ([`NashMDConfig`]):
+            The NashMD config arguments to use for training.
+        data_collator ([`~transformers.DataCollator`]):
+            The data collator to use for training. If None is specified, the default data collator
+            ([`DPODataCollatorWithPadding`]) will be used which will pad the sequences to the maximum length of the
+            sequences in the batch, given a dataset of paired sequences.
+        train_dataset ([`~datasets.Dataset`]):
+            The dataset to use for training.
+        eval_dataset ([`~datasets.Dataset`]):
+            The dataset to use for evaluation.
+        processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*):
+            Processing class used to process the data. If provided, will be used to automatically process the inputs
+            for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
+            reuse the fine-tuned model.
+        peft_config (`dict`):
+            The peft config to use for training.
+        compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
+            The function to use to compute the metrics. Must take a `EvalPrediction` and return a dictionary string to
+            metric values.
+        callbacks (`list[transformers.TrainerCallback]`):
+            The callbacks to use for training.
+        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
+            The optimizer and scheduler to use for training.
+        preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
+            The function to use to preprocess the logits before computing the metrics.
+    """
+
+    _tag_names = ["trl", "nash-md"]
+    _name = "Nash-MD"
+    _paper = {
+        "title": "Nash Learning from Human Feedback",
+        "id": "2312.00886",
+        # docstyle-ignore
+        "citation": textwrap.dedent("""\
+            @inproceedings{munos2024nash,
+                title        = {{Nash Learning from Human Feedback}},
+                author       = {R{\'{e}}mi Munos and Michal Valko and Daniele Calandriello and Mohammad Gheshlaghi Azar and Mark Rowland and Zhaohan Daniel Guo and Yunhao Tang and Matthieu Geist and Thomas Mesnard and C{\\^{o}}me Fiegel and Andrea Michi and Marco Selvi and Sertan Girgin and Nikola Momchev and Olivier Bachem and Daniel J. Mankowitz and Doina Precup and Bilal Piot},
+                year         = 2024,
+                booktitle    = {Forty-first International Conference on Machine Learning, {ICML} 2024, Vienna, Austria, July 21-27, 2024},
+                publisher    = {OpenReview.net},
+                url          = {https://openreview.net/forum?id=Y5AmNYiyCQ}
+            }"""),
+    }
+
+    def __init__(
+        self,
+        model: PreTrainedModel | nn.Module = None,
+        ref_model: PreTrainedModel | nn.Module = None,
+        reward_funcs: PreTrainedModel | nn.Module | None = None,
+        judge=None,
+        args: NashMDConfig | None = None,
+        data_collator: Callable | None = None,
+        train_dataset: Dataset | IterableDataset | None = None,
+        eval_dataset: Dataset | dict[str, Dataset] | None = None,
+        processing_class: PreTrainedTokenizerBase
+        | BaseImageProcessor
+        | FeatureExtractionMixin
+        | ProcessorMixin
+        | None = None,
+        peft_config: dict | None = None,
+        compute_metrics: Callable[[EvalPrediction], dict] | None = None,
+        callbacks: list[TrainerCallback] | None = None,
+        optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (
+            None,
+            None,
+        ),
+        preprocess_logits_for_metrics: Callable[
+            [torch.Tensor, torch.Tensor], torch.Tensor
+        ]
+        | None = None,
+    ) -> None:
+        super().__init__(
+            model=model,
+            ref_model=ref_model,
+            reward_funcs=reward_funcs,
+            judge=judge,
+            args=args,
+            data_collator=data_collator,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            processing_class=processing_class,
+            reward_processing_classes=processing_class,
+            peft_config=peft_config,
+            compute_metrics=compute_metrics,
+            callbacks=callbacks,
+            optimizers=optimizers,
+            preprocess_logits_for_metrics=preprocess_logits_for_metrics,
+        )
+
+        self._mixture_coef = self.args.mixture_coef
+
+        # Overwrite the stats dictionary to include NashMD specific statistics
+        self.stats = {
+            # Remove "non_score_reward", "rlhf_reward", "scores_margin"
+            # Add "mixture_coef"
+            "loss/kl": [],
+            "objective/entropy": [],
+            "loss/score": [],
+            "rewards/probabilities": [],
+            "rewards/accuracies": [],
+            "rewards/margins": [],
+            "logps/chosen": [],
+            "logps/rejected": [],
+            "val/model_contain_eos_token": [],
+            "val/ref_contain_eos_token": [],
+            "beta": [],
+            "mixture_coef": [],
+        }
+        if self.reward_funcs is not None:
+            if len(self.reward_funcs) != 1:
+                raise ValueError(
+                    "NashMDTrainer only supports one reward function/model."
+                )
+            self.reward_funcs = self.reward_funcs[0]
+            self.stats["rewards/chosen"] = []
+            self.stats["rewards/rejected"] = []
+
+    @property
+    def mixture_coef(self):
+        if isinstance(self._mixture_coef, list):
+            epoch = self.state.epoch
+            return (
+                self._mixture_coef[epoch]
+                if epoch < len(self._mixture_coef)
+                else self._mixture_coef[-1]
+            )
+        return self._mixture_coef
+
+    def _generate_completions(self, model, prompts):
+        # Generate completions from the policy model.
+        with unwrap_model_for_generation(
+            model, self.accelerator
+        ) as unwrapped_policy_for_gen_ctx:
+            model_output = unwrapped_policy_for_gen_ctx.generate(
+                input_ids=prompts["input_ids"],
+                attention_mask=prompts["attention_mask"],
+                generation_config=self.generation_config,
+            )
+
+        # Get the DDP/FSDP unwrapped version of the main model.
+        # This will be the policy model for GeometricMixtureWrapper (PEFT adapters active if PEFT is used).
+        policy_model_for_gmw = self.accelerator.unwrap_model(model)
+
+        # Determine the correct reference model for GeometricMixtureWrapper.
+        # This also needs to be DDP/FSDP unwrapped.
+        ref_model_for_gmw: torch.nn.Module
+        if self.ref_model is None:
+            # No explicit ref_model is provided.
+            # Use the base of the main `model` if it's a PEFT model.
+            # policy_model_for_gmw is already DDP-unwrapped.
+            if is_peft_available() and isinstance(policy_model_for_gmw, PeftModel):
+                ref_model_for_gmw = policy_model_for_gmw.get_base_model()
+            else:
+                # Not a PEFT model (or PEFT not available), or already a base model.
+                # Use the DDP-unwrapped policy model itself as the reference.
+                ref_model_for_gmw = policy_model_for_gmw
+        else:
+            # An explicit ref_model is provided. Unwrap it for DDP/FSDP.
+            ref_model_for_gmw = self.accelerator.unwrap_model(self.ref_model)
+
+        # Both models given to GeometricMixtureWrapper (policy_model_for_gmw and ref_model_for_gmw) are DDP-unwrapped.
+        with torch.no_grad():  # Ensure no_grad context for mixture model generation
+            mixture_model = GeometricMixtureWrapper(
+                model=policy_model_for_gmw,
+                ref_model=ref_model_for_gmw,
+                generation_config=self.generation_config,
+                mixture_coef=self.mixture_coef,
+                device=self.accelerator.device,
+            )
+
+            mixture_output = mixture_model.generate(
+                input_ids=prompts["input_ids"],
+                attention_mask=prompts["attention_mask"],
+                generation_config=self.generation_config,
+            )
+
+        return model_output, mixture_output
+
+    def _process_completions(self, model_output, mixture_output, prompts):
+        context_length = prompts["input_ids"].shape[1]
+
+        # Process model completions
+        model_completion_ids = model_output[:, context_length:]
+        model_completion_ids, model_completion_mask = truncate_right(
+            model_completion_ids,
+            self.processing_class.eos_token_id,
+            self.processing_class.pad_token_id,
+        )
+        model_data = {
+            "input_ids": torch.cat((prompts["input_ids"], model_completion_ids), dim=1),
+            "attention_mask": torch.cat(
+                (prompts["attention_mask"], model_completion_mask), dim=1
+            ),
+            "raw": prompts["raw"],
+        }
+
+        # Process reference model completions
+        mixture_completion_ids = mixture_output[:, context_length:]
+        mixture_completion_ids, mixture_completion_mask = truncate_right(
+            mixture_completion_ids,
+            self.processing_class.eos_token_id,
+            self.processing_class.pad_token_id,
+        )
+        mixture_data = {
+            "input_ids": torch.cat(
+                (prompts["input_ids"], mixture_completion_ids), dim=1
+            ),
+            "attention_mask": torch.cat(
+                (prompts["attention_mask"], mixture_completion_mask), dim=1
+            ),
+            "raw": prompts["raw"],
+        }
+
+        return model_data, mixture_data
+
+    def _compute_rewards(self, model_data, mixture_data, context_length):
+        with torch.no_grad():
+            _, model_scores, _ = get_reward(
+                self.reward_funcs,
+                model_data["input_ids"],
+                self.processing_class.pad_token_id,
+                context_length,
+            )
+            _, mixture_scores, _ = get_reward(
+                self.reward_funcs,
+                mixture_data["input_ids"],
+                self.processing_class.pad_token_id,
+                context_length,
+            )
+
+        # Apply EOS penalty if needed
+        if self.args.missing_eos_penalty is not None:
+            model_contain_eos = torch.any(
+                model_data["input_ids"] == self.processing_class.eos_token_id, dim=-1
+            )
+            mixture_contain_eos = torch.any(
+                mixture_data["input_ids"] == self.processing_class.eos_token_id, dim=-1
+            )
+            model_scores[~model_contain_eos] -= self.args.missing_eos_penalty
+            mixture_scores[~mixture_contain_eos] -= self.args.missing_eos_penalty
+
+        return model_scores, mixture_scores
+
+    def _compute_judge(self, model_data, mixture_data, context_length):
+        prompts = model_data["raw"]
+        model_data_completions = self.processing_class.batch_decode(
+            model_data["input_ids"][:, context_length:], skip_special_tokens=True
+        )
+        model_data_completions = [
+            completion.strip() for completion in model_data_completions
+        ]
+
+        mixture_data_completions = self.processing_class.batch_decode(
+            mixture_data["input_ids"][:, context_length:], skip_special_tokens=True
+        )
+        mixture_data_completions = [
+            completion.strip() for completion in mixture_data_completions
+        ]
+        if is_conversational({"prompt": prompts[0]}):
+            model_data_completions = [
+                [{"role": "assistant", "content": completion}]
+                for completion in model_data_completions
+            ]
+            environment = jinja2.Environment()
+            template = environment.from_string(SIMPLE_CHAT_TEMPLATE)
+            prompts = [template.render(messages=message) for message in prompts]
+            model_data_completions = [
+                template.render(messages=completion)
+                for completion in model_data_completions
+            ]
+
+            mixture_data_completions = [
+                [{"role": "assistant", "content": completion}]
+                for completion in mixture_data_completions
+            ]
+            mixture_data_completions = [
+                template.render(messages=completion)
+                for completion in mixture_data_completions
+            ]
+
+        probability = self.judge.judge(
+            prompts,
+            list(zip(model_data_completions, mixture_data_completions, strict=True)),
+            return_scores=True,
+        )
+        return torch.tensor(probability, device=model_data["input_ids"].device)
+
+    def _compute_logprobs(self, model, model_data, context_length):
+        def compute_logprobs_for_data(m, data):
+            output = m(data["input_ids"], attention_mask=data["attention_mask"])
+            logits = output.logits[:, context_length - 1 : -1]
+            token_logprobs = selective_log_softmax(
+                logits, data["input_ids"][:, context_length:]
+            )
+            return token_logprobs
+
+        # Compute logprobs for model completions under the model
+        model_logprobs_model_data = compute_logprobs_for_data(model, model_data)
+
+        # Compute logprobs of model completions under the reference model
+        with torch.no_grad():
+            if self.ref_model is None:
+                with model.disable_adapter():
+                    ref_logprobs_model_data = compute_logprobs_for_data(
+                        model, model_data
+                    )
+            else:
+                ref_logprobs_model_data = compute_logprobs_for_data(
+                    self.ref_model, model_data
+                )
+
+        # Mask padding tokens
+        model_padding_mask = model_data["attention_mask"][:, context_length:] == 0
+        model_logprobs_model_data = model_logprobs_model_data.masked_fill(
+            model_padding_mask, 0.0
+        )
+        ref_logprobs_model_data = ref_logprobs_model_data.masked_fill(
+            model_padding_mask, 0.0
+        )
+
+        return (model_logprobs_model_data, ref_logprobs_model_data)
+
+    def _compute_losses(
+        self,
+        model_logprobs_model_data,
+        ref_logprobs_model_data,
+        probability,
+    ):
+        # reinforce score where 0.5 is a control variate
+        score = (probability - 0.5) * model_logprobs_model_data.sum(1)
+
+        # kl divergence via reinforce
+        with torch.no_grad():
+            log_ratio = model_logprobs_model_data - ref_logprobs_model_data
+            kl_div_log = log_ratio.sum(1)
+        kl_div_loss = (log_ratio * model_logprobs_model_data).sum(1)
+
+        # final loss
+        loss = self.beta * kl_div_loss - score
+
+        return loss.mean(), score, kl_div_log
+
+    def _log_statistics(
+        self,
+        model_data,
+        mixture_data,
+        model_logprobs_model_data,
+        ref_logprobs_model_data,
+        probability,
+        score,
+        kl_div,
+        context_length,
+        model_scores=None,
+        mixture_scores=None,
+    ):
+        # Helper function to gather and compute mean
+        def gather_mean(tensor):
+            return self.accelerator.gather_for_metrics(tensor).mean().item()
+
+        # Log score
+        self.stats["loss/score"].append(gather_mean(score))
+        # Log KL divergence
+        self.stats["loss/kl"].append(gather_mean(kl_div))
+
+        # Log logprobs
+        model_logprobs_model_data_sum = model_logprobs_model_data.sum(1)
+        ref_logprobs_model_data_sum = ref_logprobs_model_data.sum(1)
+
+        self.stats["logps/chosen"].append(gather_mean(model_logprobs_model_data_sum))
+        self.stats["logps/rejected"].append(gather_mean(ref_logprobs_model_data_sum))
+
+        # Log rewards
+        if self.reward_funcs is not None:
+            self.stats["rewards/chosen"].append(gather_mean(model_scores))
+            self.stats["rewards/rejected"].append(gather_mean(mixture_scores))
+
+        # Log probabilities
+        self.stats["rewards/probabilities"].append(gather_mean(probability))
+
+        # Calculate entropy for model data
+        entropy_model_data = -model_logprobs_model_data.sum(1)
+        self.stats["objective/entropy"].append(gather_mean(entropy_model_data))
+
+        # Calculate margins
+        margin = model_logprobs_model_data_sum - ref_logprobs_model_data_sum
+        self.stats["rewards/margins"].append(gather_mean(margin))
+
+        # Calculate accuracy
+        accuracy = (margin > 0).float()
+        self.stats["rewards/accuracies"].append(gather_mean(accuracy))
+
+        # Log EOS token statistics
+        model_eos = (
+            model_data["input_ids"][:, context_length:]
+            == self.processing_class.eos_token_id
+        ).any(dim=1)
+        mixture_eos = (
+            mixture_data["input_ids"][:, context_length:]
+            == self.processing_class.eos_token_id
+        ).any(dim=1)
+        self.stats["val/model_contain_eos_token"].append(gather_mean(model_eos.float()))
+        self.stats["val/ref_contain_eos_token"].append(gather_mean(mixture_eos.float()))
+
+        # Log beta and mixture coef
+        self.stats["beta"].append(self.beta)
+        self.stats["mixture_coef"].append(self.mixture_coef)
+
+    def training_step(
+        self,
+        model: nn.Module,
+        inputs: dict[str, torch.Tensor | Any],
+        num_items_in_batch: int | None = None,
+    ) -> torch.Tensor:
+        model.train()
+
+        # Apply chat template and tokenize the input
+        batch_size = len(next(iter(inputs.values())))
+        prompts = inputs["prompt"]
+        inputs = [{k: v[i] for k, v in inputs.items()} for i in range(batch_size)]
+        inputs = [maybe_apply_chat_template(x, self.processing_class) for x in inputs]
+        inputs = [
+            self.tokenize_row(
+                x, self.model.config.is_encoder_decoder, self.processing_class
+            )
+            for x in inputs
+        ]
+        inputs = self.data_collator(inputs)
+
+        # need the prompt_ only
+        inputs = self._prepare_inputs(inputs)
+        context_length = inputs["prompt_input_ids"].shape[1]
+        prompts = {
+            "input_ids": inputs["prompt_input_ids"],
+            "attention_mask": inputs["prompt_attention_mask"],
+            "raw": prompts,
+        }
+        del inputs
+
+        # Sample completions from both the model and the reference model
+        model_output, mixture_output = self._generate_completions(model, prompts)
+
+        # Process model completions
+        model_data, mixture_data = self._process_completions(
+            model_output, mixture_output, prompts
+        )
+
+        # Compute rewards
+        if self.reward_funcs is not None:
+            model_scores, mixture_scores = self._compute_rewards(
+                model_data, mixture_data, context_length
+            )
+            # probability of the model data vs the mixture data
+            probability = F.sigmoid(model_scores - mixture_scores)
+        else:
+            model_scores, mixture_scores = None, None
+            probability = self._compute_judge(model_data, mixture_data, context_length)
+
+        # Compute logprobs
+        model_logprobs_model_data, ref_logprobs_model_data = self._compute_logprobs(
+            model, model_data, context_length
+        )
+
+        # Compute loss
+        loss, score, kl_div = self._compute_losses(
+            model_logprobs_model_data, ref_logprobs_model_data, probability
+        )
+
+        # Log everything
+        self._log_statistics(
+            model_data,
+            mixture_data,
+            model_logprobs_model_data.detach(),
+            ref_logprobs_model_data,
+            probability,
+            score.detach(),
+            kl_div.detach(),
+            context_length,
+            model_scores,
+            mixture_scores,
+        )
+
+        if (
+            self.args.torch_empty_cache_steps is not None
+            and self.state.global_step % self.args.torch_empty_cache_steps == 0
+        ):
+            empty_cache()
+
+        kwargs = {}
+        # For LOMO optimizers you need to explicitly use the learning rate
+        if self.args.optim in [OptimizerNames.LOMO, OptimizerNames.ADALOMO]:
+            kwargs["learning_rate"] = self._get_learning_rate()
+
+        if self.args.n_gpu > 1:
+            loss = loss.mean()  # mean() to average on multi-gpu parallel training
+
+        self.accelerator.backward(loss, **kwargs)
+
+        return loss.detach() / self.args.gradient_accumulation_steps
diff --git a/src/aixpert/training/training/trl/trainer/online_dpo_config.py b/src/aixpert/training/training/trl/trainer/online_dpo_config.py
new file mode 100644
index 0000000..ac11c1d
--- /dev/null
+++ b/src/aixpert/training/training/trl/trainer/online_dpo_config.py
@@ -0,0 +1,396 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+from dataclasses import dataclass, field
+from typing import Any
+
+from transformers import TrainingArguments
+
+
+@dataclass
+class OnlineDPOConfig(TrainingArguments):
+    r"""
+    Configuration class for the [`OnlineDPOTrainer`].
+
+    This class includes only the parameters that are specific to Online DPO training. For a full list of training
+    arguments, please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this
+    class may differ from those in [`~transformers.TrainingArguments`].
+
+    Using [`~transformers.HfArgumentParser`] we can turn this class into
+    [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
+    command line.
+
+    Parameters
+    ----------
+        reward_model_path (`str`, *optional*):
+            Path to the reward model. Either `judge` or `reward_model_path` must be set, but not both.
+        judge (`str`, *optional*):
+            Name of the judge to use. Either `judge` or `reward_model_path` must be set, but not both.
+        max_new_tokens (`int`, *optional*, defaults to `64`):
+            Maximum number of tokens to generate per completion.
+        max_length (`int`, *optional*, defaults to `256`):
+            Maximum total length of the sequence (prompt + completion) used to compute log probabilities. If the
+            sequence exceeds this limit, the leftmost tokens will be truncated to preserve as much of the completion as
+            possible.
+        temperature (`float`, *optional*, defaults to `0.9`):
+            Temperature for sampling. The higher the temperature, the more random the completions.
+        missing_eos_penalty (`float`, *optional*):
+            Penalty applied to the score when the model fails to generate an EOS token. This is useful to encourage to
+            generate completions shorter than the maximum length (`max_new_tokens`). The penalty must be a positive
+            value. This parameter only works when using `reward_funcs` and not when using `judge`.
+        beta (`float` or `list[float]`, *optional*, defaults to `0.1`):
+            Parameter controlling the deviation from the reference model. Higher β means less deviation from the
+            reference model. For the IPO loss (`loss_type="ipo"`), β is the regularization parameter denoted by τ in
+            the [paper](https://huggingface.co/papers/2310.12036). If a list of floats is provided then the β is
+            selected for each new epoch and the last β is used for the rest of the epochs.
+        loss_type (`str`, *optional*, defaults to `"sigmoid"`):
+            Type of loss to use. Possible values are:
+
+                - `"sigmoid"`: sigmoid loss from the original [DPO](https://huggingface.co/papers/2305.18290) paper.
+                - `"ipo"`: IPO loss from the [IPO](https://huggingface.co/papers/2310.12036) paper.
+        disable_dropout (`bool`, *optional*, defaults to `True`):
+            Whether to disable dropout in the model and reference model.
+
+        > Parameters that control generation
+
+        top_p (`float`, *optional*, defaults to `1.0`):
+            Float that controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. Set to
+            `1.0` to consider all tokens.
+        top_k (`int`, *optional*):
+            Number of highest probability vocabulary tokens to keep for top-k-filtering. If `None`, top-k-filtering is
+            disabled and all tokens are considered.
+        min_p (`float`, *optional*):
+            Minimum token probability, which will be scaled by the probability of the most likely token. It must be a
+            value between `0.0` and `1.0`. Typical values are in the `0.01-0.2` range.
+        repetition_penalty (`float`, *optional*, defaults to `1.0`):
+            Float that penalizes new tokens based on whether they appear in the prompt and the generated text so far.
+            Values > `1.0` encourage the model to use new tokens, while values < `1.0` encourage the model to repeat
+            tokens.
+        use_transformers_paged (`bool`, *optional*, defaults to `False`):
+            Whether to use the `transformers` paged implementation for generation. If set to `True`, the `transformers`
+            paged implementation will be used for generation instead of the default padded implementation. This
+            parameter is only effective when `use_vllm` is set to `False`.
+        cache_implementation (`str`, *optional*):
+            Implementation of the cache method for faster generation when `use_vllm` is set to `False`.
+        generation_kwargs (`dict[str, Any]`, *optional*):
+            Additional keyword arguments to pass to [`~transformers.GenerationConfig`] (if using transformers) or
+            `SamplingParams` (if using vLLM) when sampling completions. This can be used to further customize the
+            generation behavior, such as setting `suppress_tokens`, `num_beams`, etc. If it contains keys that conflict
+            with the other generation parameters (like `min_p`, `top_p`, etc.), they will override them.
+
+        > Parameters that control generation acceleration powered by vLLM
+
+        use_vllm (`bool`, *optional*, defaults to `False`):
+            Whether to use vLLM for generating completions. If set to `True`, the trainer will use vLLM for generation
+            instead of the default model.generate(). Requires `vllm` to be installed.
+        vllm_model_impl (`str`, *optional*, defaults to `"vllm"`):
+            Model implementation to use for vLLM. Must be one of `"transformers"` or `"vllm"`. `"transformers"`: Use
+            the `transformers` backend for model implementation. `"vllm"`: Use the `vllm` library for model
+            implementation.
+        vllm_mode (`str`, *optional*, defaults to `"server"`):
+            Mode to use for vLLM integration when `use_vllm` is set to `True`. Must be one of `"server"` or
+            `"colocate"`.
+
+            - `"server"`: The trainer will send generation requests to a separate vLLM server. Make sure a TRL vLLM
+              server is running (start with `trl vllm-serve`).
+            - `"colocate"`: vLLM will run in the same process and share the training GPUs. This avoids the need for a
+              separate server but may cause resource contention with training.
+        vllm_guided_decoding_regex (`str`, *optional*):
+            Regex for vLLM guided decoding. If `None` (default), guided decoding is disabled.
+
+        > Parameters that control the vLLM server (only used when `vllm_mode` is `"server"`)
+
+        vllm_server_base_url (`str`, *optional*):
+            Base URL for the vLLM server (e.g., `"http://localhost:8000"`). If provided, `vllm_server_host` and
+            `vllm_server_port` are ignored.
+        vllm_server_host (`str`, *optional*, defaults to `"0.0.0.0"`):
+            Host of the vLLM server to connect to. Ignored if `vllm_server_base_url` is provided.
+        vllm_server_port (`int`, *optional*, defaults to `8000`):
+            Port of the vLLM server to connect to. Ignored if `vllm_server_base_url` is provided.
+        vllm_server_timeout (`float`, *optional*, defaults to `240.0`):
+            Total timeout duration in seconds to wait for the vLLM server to be up. If the server is not up after the
+            timeout, a `ConnectionError` is raised.
+
+        > Parameters that control colocated vLLM execution (only used when `vllm_mode` is `"colocate"`)
+
+        vllm_gpu_memory_utilization (`float`, *optional*, defaults to `0.55`):
+            Control the GPU memory utilization for vLLM. This setting only applies when `vllm_mode` is set to
+            `"colocate"`. If you are using `vllm_mode="server"`, this parameter must be passed separately when
+            launching the vLLM server via the `--vllm_gpu_memory_utilization` flag.
+        vllm_tensor_parallel_size (`int`, *optional*, defaults to `1`):
+            Control the tensor parallel size for vLLM. This setting only applies when `vllm_mode` is set to
+            `"colocate"`. If you are using `vllm_mode="server"`, this parameter must be passed separately when
+            launching the vLLM server via the `--vllm_tensor_parallel_size` flag.
+
+        > Other parameters
+
+        ds3_gather_for_generation (`bool`, *optional*, defaults to `True`):
+            This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation,
+            improving generation speed. However, disabling this option allows training models that exceed the VRAM
+            capacity of a single GPU, albeit at the cost of slower generation. Disabling this option is not compatible
+            with vLLM generation.
+        model_init_kwargs (`dict[str, Any]`, *optional*):
+            Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a
+            string.
+    """
+
+    # Parameters whose default values are overridden from TrainingArguments
+    learning_rate: float = field(
+        default=5e-7,
+        metadata={"help": "The initial learning rate for AdamW."},
+    )
+    logging_steps: float = field(
+        default=10,
+        metadata={
+            "help": "Log every X updates steps. Should be an integer or a float in range `[0,1)`. If smaller than 1, "
+            "will be interpreted as ratio of total training steps."
+        },
+    )
+    gradient_checkpointing: bool = field(
+        default=True,
+        metadata={
+            "help": "If True, use gradient checkpointing to save memory at the expense of slower backward pass."
+        },
+    )
+    bf16: bool | None = field(
+        default=None,
+        metadata={
+            "help": "Whether to use bf16 (mixed) precision instead of 32-bit. Requires Ampere or higher NVIDIA "
+            "architecture or Intel XPU or using CPU (use_cpu) or Ascend NPU. If not set, it defaults to `True` if "
+            "`fp16` is not set."
+        },
+    )
+
+    reward_model_path: str | None = field(
+        default=None,
+        metadata={
+            "help": "Path to the reward model. Either `judge` or `reward_model_path` must be set, but not both."
+        },
+    )
+    judge: str | None = field(
+        default=None,
+        metadata={
+            "help": "Name of the judge to use. Either `judge` or `reward_model_path` must be set, but not both."
+        },
+    )
+    max_new_tokens: int = field(
+        default=64,
+        metadata={"help": "Maximum number of tokens to generate per completion."},
+    )
+    max_length: int = field(
+        default=512,
+        metadata={
+            "help": "Maximum total length of the sequence (prompt + completion) used to compute log probabilities. If "
+            "the sequence exceeds this limit, the leftmost tokens will be truncated to preserve as much of the "
+            "completion as possible."
+        },
+    )
+    temperature: float = field(
+        default=0.9,
+        metadata={
+            "help": "Temperature for sampling. The higher the temperature, the more random the completions."
+        },
+    )
+    top_p: float = field(
+        default=1.0,
+        metadata={
+            "help": "Float that controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. "
+            "Set to 1.0 to consider all tokens."
+        },
+    )
+    top_k: int | None = field(
+        default=None,
+        metadata={
+            "help": "Number of highest probability vocabulary tokens to keep for top-k-filtering. If `None`, "
+            "top-k-filtering is disabled and all tokens are considered."
+        },
+    )
+    min_p: float | None = field(
+        default=None,
+        metadata={
+            "help": "Minimum token probability, which will be scaled by the probability of the most likely token. It "
+            "must be a value between 0.0 and 1.0. Typical values are in the 0.01-0.2 range."
+        },
+    )
+    repetition_penalty: float = field(
+        default=1.0,
+        metadata={
+            "help": "Float that penalizes new tokens based on whether they appear in the prompt and the generated "
+            "text so far. Values > 1.0 encourage the model to use new tokens, while values < 1.0 encourage the model "
+            "to repeat tokens."
+        },
+    )
+    generation_kwargs: dict | None = field(
+        default=None,
+        metadata={
+            "help": "Additional keyword arguments to pass to `GenerationConfig` (if using transformers) or "
+            "`SamplingParams` (if using vLLM) when sampling completions. This can be used to further customize the "
+            "generation behavior, such as setting `suppress_tokens`, `num_beams`, etc. If it contains keys that "
+            "conflict with the other generation parameters (like `min_p`, `top_p`, etc.), they will override them."
+        },
+    )
+    use_transformers_paged: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to use the `transformers` paged implementation for generation. If set to `True`, the "
+            "`transformers` paged implementation will be used for generation instead of the default padded "
+            "implementation. This parameter is only effective when `use_vllm` is set to `False`."
+        },
+    )
+    cache_implementation: str | None = field(
+        default=None,
+        metadata={
+            "help": "Implementation of the cache method for faster generation when use_vllm is set to False."
+        },
+    )
+    missing_eos_penalty: float | None = field(
+        default=None,
+        metadata={
+            "help": "Penalty applied to the score when the model fails to generate an EOS token. This is useful to "
+            "encourage to generate completions shorter than the maximum length (`max_new_tokens`). The penalty must be "
+            "a positive value."
+        },
+    )
+    beta: list[float] = field(
+        default_factory=lambda: [0.1],
+        metadata={
+            "help": "Parameter controlling the deviation from the reference model. Higher β means less deviation from "
+            "the reference model. For the IPO loss (`loss_type='ipo'`), β is the regularization parameter denoted by "
+            "τ in the [paper](https://huggingface.co/papers/2310.12036). If a list of floats is provided then the β "
+            "is selected for each new epoch and the last β is used for the rest of the epochs."
+        },
+    )
+    loss_type: str = field(
+        default="sigmoid",
+        metadata={
+            "help": "Type of loss to use.",
+            "choices": ["sigmoid", "ipo"],
+        },
+    )
+    disable_dropout: bool = field(
+        default=True,
+        metadata={"help": "Whether to disable dropout in the model."},
+    )
+    use_vllm: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to use vLLM for generating completions. Requires vLLM to be installed "
+            "(`pip install trl[vllm]`)."
+        },
+    )
+    vllm_model_impl: str = field(
+        default="vllm",
+        metadata={
+            "help": "Model implementation to use for vLLM. Must be one of `transformers` or `vllm`. `transformers`: "
+            "Use the `transformers` backend for model implementation. `vllm`: Use the `vllm` library for "
+            "model implementation."
+        },
+    )
+    vllm_guided_decoding_regex: str | None = field(
+        default=None,
+        metadata={
+            "help": "Regex for vLLM guided decoding. If `None` (default), guided decoding is disabled."
+        },
+    )
+    vllm_gpu_memory_utilization: float | None = field(
+        default=0.55,
+        metadata={
+            "help": "Control the GPU memory utilization for vLLM. This setting only applies when `vllm_mode` is set "
+            "to `'colocate'`. If you are using `vllm_mode='server'`, this parameter must be passed separately when "
+            "launching the vLLM server via the `--vllm_gpu_memory_utilization` flag.",
+        },
+    )
+    vllm_mode: str = field(
+        default="server",
+        metadata={
+            "help": "Mode to use for vLLM integration when `use_vllm` is set to `True`. Must be one of `'server'` or "
+            "`'colocate'`. `'server'`: The trainer will send generation requests to a separate vLLM server. Make sure "
+            "a TRL vLLM server is running (start with `trl vllm-serve`). `'colocate'`: vLLM will run in the same "
+            "process and share the training GPUs. This avoids the need for a separate server but may cause resource "
+            "contention with training.",
+        },
+    )
+    vllm_server_base_url: str | None = field(
+        default=None,
+        metadata={
+            "help": "Base URL for the vLLM server (e.g., 'http://localhost:8000'). If provided, `vllm_server_host` "
+            "and `vllm_server_port` are ignored.",
+        },
+    )
+    vllm_server_host: str = field(
+        default="0.0.0.0",
+        metadata={
+            "help": "Host of the vLLM server to connect to. Ignored if vllm_server_base_url is provided."
+        },
+    )
+    vllm_server_port: int = field(
+        default=8000,
+        metadata={
+            "help": "Port of the vLLM server to connect to. Ignored if vllm_server_base_url is provided."
+        },
+    )
+    vllm_server_timeout: float = field(
+        default=240.0,
+        metadata={
+            "help": "Total timeout duration in seconds to wait for the vLLM server to be up. If the server is not up "
+            "after the timeout, a `ConnectionError` is raised.",
+        },
+    )
+    vllm_tensor_parallel_size: int = field(
+        default=1,
+        metadata={
+            "help": "Control the tensor parallel size for vLLM. This setting only applies when `vllm_mode` is set "
+            "to `'colocate'`. If you are using `vllm_mode='server'`, this parameter must be passed separately when "
+            "launching the vLLM server via the `--vllm_tensor_parallel_size` flag.",
+        },
+    )
+    ds3_gather_for_generation: bool = field(
+        default=True,
+        metadata={
+            "help": "This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for "
+            "generation, improving generation speed. However, disabling this option allows training models that "
+            "exceed the VRAM capacity of a single GPU, albeit at the cost of slower generation. Disabling this option "
+            "is not compatible with vLLM generation."
+        },
+    )
+    model_init_kwargs: dict[str, Any] | None = field(
+        default=None,
+        metadata={
+            "help": "Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model "
+            "from a string."
+        },
+    )
+    reward_weights: list[float] | None = field(
+        default=None,
+        metadata={
+            "help": "Weights for combining multiple reward functions. Must match the number of reward functions. "
+            "If None, all reward functions are equally weighted."
+        },
+    )
+
+    def __post_init__(self):
+        self.bf16 = not (self.fp16) if self.bf16 is None else self.bf16
+
+        super().__post_init__()
+
+        if hasattr(self.beta, "__len__") and len(self.beta) == 1:
+            self.beta = self.beta[0]
+
+        if self.max_new_tokens >= self.max_length:
+            warnings.warn(
+                f"The configuration has `max_new_tokens` ({self.max_new_tokens}) >= `max_length` ({self.max_length}). "
+                "This will cause prompts to be truncated or completely removed in the forward pass. "
+                "To preserve prompts, ensure  e.g. `max_length > max_new_tokens + 512`. ",
+            )
diff --git a/src/aixpert/training/training/trl/trainer/online_dpo_trainer.py b/src/aixpert/training/training/trl/trainer/online_dpo_trainer.py
new file mode 100644
index 0000000..840f344
--- /dev/null
+++ b/src/aixpert/training/training/trl/trainer/online_dpo_trainer.py
@@ -0,0 +1,1797 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import re
+import textwrap
+import warnings
+from collections.abc import Callable
+from contextlib import nullcontext
+from functools import wraps
+from pathlib import Path
+from typing import Any
+
+import jinja2
+import torch
+import torch.nn.functional as F
+import torch.utils.data
+import transformers
+from accelerate import logging
+from accelerate.utils import broadcast_object_list, gather_object, is_peft_model
+from datasets import Dataset
+from packaging import version
+from torch import nn
+from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+from torch.utils.data import DataLoader, IterableDataset
+from transformers import (
+    AutoModelForCausalLM,
+    AutoModelForSequenceClassification,
+    AutoTokenizer,
+    DataCollator,
+    GenerationConfig,
+    PreTrainedModel,
+    PreTrainedTokenizerBase,
+    ProcessorMixin,
+    Trainer,
+    TrainerCallback,
+)
+from transformers.models.auto.modeling_auto import (
+    MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES,
+)
+from transformers.trainer_utils import EvalPrediction, seed_worker
+from transformers.training_args import OptimizerNames
+from transformers.utils import (
+    is_flash_attn_2_available,
+    is_peft_available,
+    is_sagemaker_mp_enabled,
+)
+
+from ..data_utils import (
+    apply_chat_template,
+    is_conversational,
+    maybe_apply_chat_template,
+)
+from ..extras.profiling import profiling_context
+from ..extras.vllm_client import VLLMClient
+from ..import_utils import is_vllm_available
+from ..models import (
+    create_reference_model,
+    prepare_deepspeed,
+    prepare_fsdp,
+    prepare_peft_model,
+    unwrap_model_for_generation,
+)
+from .base_trainer import BaseTrainer
+from .online_dpo_config import OnlineDPOConfig
+from .utils import (
+    SIMPLE_CHAT_TEMPLATE,
+    DPODataCollatorWithPadding,
+    disable_dropout_in_model,
+    empty_cache,
+    ensure_master_addr_port,
+    get_config_model_id,
+    pad,
+    truncate_right,
+)
+
+
+if is_peft_available():
+    from peft import PeftConfig, PeftModel
+
+
+if is_sagemaker_mp_enabled():
+    from smdistributed.modelparallel import __version__ as SMP_VERSION
+
+    IS_SAGEMAKER_MP_POST_1_10 = version.parse(SMP_VERSION) >= version.parse("1.10")
+
+else:
+    IS_SAGEMAKER_MP_POST_1_10 = False
+
+
+if is_vllm_available():
+    from vllm import LLM, SamplingParams
+    from vllm.sampling_params import GuidedDecodingParams
+
+
+logger = logging.get_logger(__name__)
+
+# What we call a reward function is a callable that takes a list of prompts and completions and returns a list of
+# rewards. When it's a string, it's a model ID, so it's loaded as a pretrained model.
+RewardFunc = str | PreTrainedModel | Callable[[list, list], list[float]]
+
+
+class OnlineDPOTrainer(BaseTrainer):
+    r"""
+    Initialize OnlineDPOTrainer.
+
+    Args:
+        model (`str | nn.Module | PreTrainedModel`):
+            Model to be trained. Can be either:
+
+            - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or a
+              path to a *directory* containing model weights saved using
+              [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded
+              using [`~transformers.AutoModelForCausalLM.from_pretrained`] with the keyword arguments in
+              `args.model_init_kwargs`.
+            - A [`~transformers.PreTrainedModel`] object. Only causal language models are supported.
+        ref_model ([`~transformers.PreTrainedModel`] or `torch.nn.Module` or `None`):
+            The reference model to use for training. If None is specified, the reference model will be created from the
+            model.
+        judge ([`experimental.judges.BasePairwiseJudge`]):
+            The judge to use for pairwise comparison of model completions.
+        reward_funcs (`RewardFunc | list[RewardFunc]`, *optional*):
+            Reward functions to be used for computing the rewards. To compute the rewards, we call all the reward
+            functions with the prompts and completions and sum the rewards. Can be either:
+
+            - A single reward function: Can be a string (path to model), a [`~transformers.PreTrainedModel`], or a
+              custom callable function.
+            - A list of reward functions: Must all be of compatible types.
+
+            Note: Only one of `judge`, or `reward_funcs` should be provided.
+        args ([`OnlineDPOConfig`]):
+            The online DPO config arguments to use for training.
+        data_collator ([`~transformers.DataCollator`]):
+            The data collator to use for training. If None is specified, the default data collator
+            ([`DPODataCollatorWithPadding`]) will be used which will pad the sequences to the maximum length of the
+            sequences in the batch, given a dataset of paired sequences.
+        train_dataset ([`~datasets.Dataset`] or [`~datasets.IterableDataset`]):
+            The dataset to use for training.
+        eval_dataset ([`~datasets.Dataset`], [`~datasets.IterableDataset`] or `dict[str, Dataset | IterableDataset]`):
+            The dataset to use for evaluation.
+        processing_class ([`~transformers.PreTrainedTokenizerBase`] or [`~transformers.ProcessorMixin`], *optional*):
+            Processing class used to process the data. If provided, will be used to automatically process the inputs
+            for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
+            reuse the fine-tuned model.
+        reward_processing_classes ([`~transformers.PreTrainedTokenizerBase`] or `list[PreTrainedTokenizerBase]`, *optional*):
+            Processing classes corresponding to the reward functions specified in `reward_funcs`. Can be either:
+
+            - A single processing class: Used when `reward_funcs` contains only one reward function.
+            - A list of processing classes: Must match the order and length of the reward functions in `reward_funcs`.
+
+            If set to `None`, the tokenizer for each model-based reward function is automatically loaded using
+            [`~transformers.AutoTokenizer.from_pretrained`].
+        peft_config ([`~peft.PeftConfig`], *optional*):
+            PEFT configuration used to wrap the model. If `None`, the model is not wrapped.
+        compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
+            The function to use to compute the metrics. Must take a `EvalPrediction` and return a dictionary string to
+            metric values.
+        callbacks (`list[transformers.TrainerCallback]`):
+            The callbacks to use for training.
+        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
+            The optimizer and scheduler to use for training.
+        preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
+            The function to use to preprocess the logits before computing the metrics.
+    """
+
+    _tag_names = ["trl", "online-dpo"]
+    _name = "Online DPO"
+    _paper = {
+        "title": "Direct Language Model Alignment from Online AI Feedback",
+        "id": "2402.04792",
+        # docstyle-ignore
+        "citation": textwrap.dedent("""\
+            @article{guo2024direct,
+                title        = {{Direct Language Model Alignment from Online AI Feedback}},
+                author       = {Shangmin Guo and Biao Zhang and Tianlin Liu and Tianqi Liu and Misha Khalman and Felipe Llinares and Alexandre Ram{\'{e}} and Thomas Mesnard and Yao Zhao and Bilal Piot and Johan Ferret and Mathieu Blondel},
+                year         = 2024,
+                eprint       = {arXiv:2402.04792}
+            }"""),
+    }
+
+    def __init__(
+        self,
+        model: PreTrainedModel | nn.Module | str,
+        ref_model: PreTrainedModel | nn.Module | None = None,
+        reward_funcs: RewardFunc | list[RewardFunc] | None = None,
+        judge=None,
+        args: OnlineDPOConfig | None = None,
+        data_collator: DataCollator | None = None,
+        train_dataset: Dataset | IterableDataset | None = None,
+        eval_dataset: Dataset
+        | IterableDataset
+        | dict[str, Dataset | IterableDataset]
+        | None = None,
+        processing_class: PreTrainedTokenizerBase | ProcessorMixin | None = None,
+        reward_processing_classes: PreTrainedTokenizerBase
+        | list[PreTrainedTokenizerBase]
+        | None = None,
+        peft_config: "PeftConfig | None" = None,
+        compute_metrics: Callable[[EvalPrediction], dict] | None = None,
+        callbacks: list[TrainerCallback] | None = None,
+        optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (
+            None,
+            None,
+        ),
+        preprocess_logits_for_metrics: Callable[
+            [torch.Tensor, torch.Tensor], torch.Tensor
+        ]
+        | None = None,
+    ) -> None:
+        if not os.environ.get("TRL_EXPERIMENTAL_SILENCE"):
+            warnings.warn(
+                "This trainer will soon be moved to trl.experimental and is a candidate for removal. If you rely on "
+                "it and want it to remain, please share your comments here: "
+                "https://github.com/huggingface/trl/issues/4223. Silence this warning by setting environment variable "
+                "TRL_EXPERIMENTAL_SILENCE=1."
+            )
+        if ref_model is model:
+            raise ValueError(
+                "`model` and `ref_model` cannot be the same object. If you want `ref_model` to be the "
+                "same as `model`, either omit the `ref_model` argument or pass `None`."
+            )
+
+        self.ref_model = ref_model
+
+        # Validate reward configuration - must have exactly one of: judge, or reward_funcs
+        reward_configs = sum(x is not None for x in [judge, reward_funcs])
+        if reward_configs == 0:
+            raise ValueError("One of `judge` or `reward_funcs` must be provided.")
+        if reward_configs > 1:
+            if judge is not None:
+                logger.warning(
+                    "Both `judge` and `reward_funcs` are provided. Using `judge` and ignoring `reward_funcs`.",
+                    UserWarning,
+                )
+                reward_funcs = None
+        self.judge = judge
+
+        # Handle reward_funcs
+        if reward_funcs is not None:
+            if not isinstance(reward_funcs, list):
+                reward_funcs = [reward_funcs]
+            self.reward_func_names = []
+
+            # Process reward functions (convert strings to models, collect names)
+            model_init_kwargs = args.model_init_kwargs or {}
+            for i, reward_func in enumerate(reward_funcs):
+                if isinstance(reward_func, str):
+                    # Load model from string path
+                    reward_funcs[i] = (
+                        AutoModelForSequenceClassification.from_pretrained(
+                            reward_func, num_labels=1, **model_init_kwargs
+                        )
+                    )
+                if isinstance(reward_funcs[i], nn.Module):
+                    self.reward_func_names.append(
+                        get_config_model_id(reward_funcs[i].config).split("/")[-1]
+                    )
+                else:
+                    self.reward_func_names.append(reward_funcs[i].__name__)
+            self.reward_funcs = reward_funcs
+
+            # Handle reward processing classes for reward_funcs
+            if reward_processing_classes is None:
+                reward_processing_classes = [None] * len(reward_funcs)
+            elif not isinstance(reward_processing_classes, list):
+                reward_processing_classes = [reward_processing_classes]
+            elif len(reward_processing_classes) != len(reward_funcs):
+                raise ValueError(
+                    "The number of reward processing classes must match the number of reward functions."
+                )
+
+            self.reward_processing_classes = []
+            for reward_processing_class_i, reward_func in zip(
+                reward_processing_classes, reward_funcs, strict=True
+            ):
+                if isinstance(reward_func, PreTrainedModel):
+                    if reward_processing_class_i is None:
+                        reward_processing_class_i = AutoTokenizer.from_pretrained(
+                            reward_func.config._name_or_path
+                        )
+                    if reward_processing_class_i.pad_token_id is None:
+                        reward_processing_class_i.pad_token = (
+                            reward_processing_class_i.eos_token
+                        )
+                    # Set pad token ID on reward model config
+                    reward_func.config.pad_token_id = (
+                        reward_processing_class_i.pad_token_id
+                    )
+                self.reward_processing_classes.append(reward_processing_class_i)
+        else:
+            self.reward_funcs = None
+            self.reward_func_names = []
+            self.reward_processing_classes = []
+
+        # Handle reward_weights
+        if reward_funcs is not None:
+            if args.reward_weights is not None:
+                if len(args.reward_weights) != len(self.reward_funcs):
+                    raise ValueError(
+                        f"Number of reward weights ({len(args.reward_weights)}) must match number of reward "
+                        f"functions ({len(self.reward_funcs)})"
+                    )
+                self.reward_weights = torch.tensor(
+                    args.reward_weights, dtype=torch.float32
+                )
+            else:
+                self.reward_weights = torch.ones(
+                    len(self.reward_funcs), dtype=torch.float32
+                )
+        else:
+            self.reward_weights = None
+
+        if (
+            args.missing_eos_penalty is not None
+            and reward_funcs is None
+            and judge is None
+        ):
+            raise ValueError(
+                "`missing_eos_penalty` is only supported when `reward_funcs` is provided."
+            )
+
+        if args is None:
+            raise ValueError("`args` must be provided.")
+
+        # Check that the processing_class is provided
+        if processing_class is None:
+            raise ValueError("`processing_class` must be provided.")
+
+        model_init_kwargs = args.model_init_kwargs or {}
+        if isinstance(model, str):
+            model_id = model
+
+            # Handle dtype in model_init_kwargs
+            dtype = model_init_kwargs.get("dtype")
+            if isinstance(dtype, torch.dtype) or dtype == "auto" or dtype is None:
+                pass
+            elif isinstance(dtype, str):
+                dtype = getattr(torch, dtype)
+                model_init_kwargs["dtype"] = dtype
+            else:
+                raise ValueError(
+                    "Invalid `dtype` passed to `OnlineDPOConfig`. Expected either 'auto' or a string "
+                    f"representing a `torch.dtype` (e.g., 'float32'), but got {dtype}."
+                )
+
+            model = AutoModelForCausalLM.from_pretrained(model_id, **model_init_kwargs)
+        elif args.model_init_kwargs is not None:
+            raise ValueError(
+                "You passed `model_init_kwargs` to the `OnlineDPOConfig`, but your model is already instantiated. "
+                "This argument can only be used when the `model` argument is a string."
+            )
+        self.is_encoder_decoder = model.config.is_encoder_decoder
+        self.is_vision_model = (
+            model.config.model_type in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.keys()
+        )
+
+        if peft_config is not None or (
+            is_peft_available() and isinstance(model, PeftModel)
+        ):
+            model = prepare_peft_model(model, peft_config, args)
+
+        # Enable gradient checkpointing if requested
+        if args.gradient_checkpointing:
+            model = self._enable_gradient_checkpointing(model, args)
+
+        # Disable dropout in the model and reference model
+        if args.disable_dropout:
+            disable_dropout_in_model(model)
+            if self.ref_model is not None:
+                disable_dropout_in_model(self.ref_model)
+
+        # Handle the ref_model
+        # Usually, the user wants the ref model to be the initial version of the model. When using PEFT, it's easy to
+        # get the ref model, as it's just the model with a disabled adapter. When not using PEFT, we need to create
+        # the ref model from the model by copying it and disable the gradients and set it in evaluation mode.
+        if ref_model is None:  # No ref model provided, the most common case
+            if peft_config is None:
+                self.ref_model = create_reference_model(
+                    model
+                )  # copy, disable gradients, set eval mode
+            else:
+                self.ref_model = None  # we don't need a ref model here, we can just disable the adapter.
+        else:  # rare case, the user provided a ref model
+            self.ref_model = ref_model
+            self.ref_model.eval()
+
+        # Disable the gradient and set the reward model in eval mode
+        if reward_funcs is not None:
+            for reward_func in reward_funcs:
+                if isinstance(reward_func, PreTrainedModel):
+                    reward_func.eval()
+
+        self.max_length = args.max_length
+
+        self.stats = {
+            "objective/kl": [],
+            "objective/entropy": [],
+            "objective/non_score_reward": [],
+            "rewards/chosen": [],
+            "rewards/rejected": [],
+            "rewards/accuracies": [],
+            "rewards/margins": [],
+            "logps/chosen": [],
+            "logps/rejected": [],
+            "val/contain_eos_token": [],
+            "beta": [],
+        }
+        if self.reward_funcs is not None:
+            self.stats["objective/rlhf_reward"] = []
+            self.stats["objective/scores_margin"] = []
+            self.stats["objective/scores"] = []
+
+        # Store generation parameters for later use
+        self.use_vllm = args.use_vllm
+        self.num_generations = 2  # Generate 2 completions per prompt for Online DPO
+        self.temperature = args.temperature
+        self.top_p = args.top_p
+        self.top_k = args.top_k
+        self.min_p = args.min_p
+        self.repetition_penalty = args.repetition_penalty
+        self.use_transformers_paged = args.use_transformers_paged
+        self.vllm_mode = args.vllm_mode if args.use_vllm else None
+        self.vllm_gpu_memory_utilization = args.vllm_gpu_memory_utilization
+        self.vllm_tensor_parallel_size = args.vllm_tensor_parallel_size
+        self.vllm_model_impl = args.vllm_model_impl
+
+        # Handle pad token for processors or tokenizers
+        if isinstance(processing_class, ProcessorMixin):
+            tokenizer = processing_class.tokenizer
+        elif isinstance(processing_class, PreTrainedTokenizerBase):
+            tokenizer = processing_class
+        else:
+            raise TypeError(
+                "The `processing_class` must be either a `PreTrainedTokenizerBase` or a `ProcessorMixin`"
+            )
+
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+
+        self.pad_token = tokenizer.pad_token
+        self.pad_token_id = tokenizer.pad_token_id
+        self.eos_token_id = tokenizer.eos_token_id
+
+        # Vision tokens for VLM support
+        self.image_token_id = getattr(processing_class, "image_token_id", None)
+        self.vision_start_token_id = getattr(
+            processing_class, "vision_start_token_id", None
+        )
+        self.vision_end_token_id = getattr(
+            processing_class, "vision_end_token_id", None
+        )
+        # Get the image token string for token collapsing
+        self.image_token = None
+        if self.image_token_id is not None:
+            self.image_token = tokenizer.decode([self.image_token_id])
+
+        # Define the collator if not provided
+        if data_collator is None:
+            data_collator = DPODataCollatorWithPadding(pad_token_id=self.pad_token_id)
+
+        # The trainer estimates the number of FLOPs (floating-point operations) using the number of elements in the
+        # input tensor associated with the key "input_ids". However, in Online DPO, the sampled data does not include
+        # the "input_ids" key. As a result, the trainer issues the warning: "Could not estimate the number of tokens
+        # of the input, floating-point operations will not be computed." To suppress this warning, we set the
+        # "estimate_tokens" key in the model's "warnings_issued" dictionary to True. This acts as a flag to indicate
+        # that the warning has already been issued.
+        model.warnings_issued["estimate_tokens"] = True
+
+        super().__init__(
+            model=model,
+            args=args,
+            data_collator=data_collator,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            processing_class=processing_class,
+            compute_metrics=compute_metrics,
+            callbacks=callbacks,
+            optimizers=optimizers,
+            preprocess_logits_for_metrics=preprocess_logits_for_metrics,
+        )
+
+        # Add tags for models that have been loaded with the correct transformers version
+        if hasattr(self.model, "add_model_tags"):
+            self.model.add_model_tags(self._tag_names)
+
+        self._beta = args.beta
+
+        # Set up generation configuration and vLLM after super().__init__
+        if self.use_vllm:
+            if not is_vllm_available():
+                raise ImportError(
+                    "vLLM is not available and `use_vllm` is set to True. Please install vLLM with "
+                    "`pip install trl[vllm]` to use it."
+                )
+
+            if self.vllm_mode == "server":
+                if self.accelerator.is_main_process:
+                    if args.vllm_server_base_url is not None:
+                        base_url = args.vllm_server_base_url
+                    else:
+                        base_url = (
+                            f"http://{args.vllm_server_host}:{args.vllm_server_port}"
+                        )
+                    self.vllm_client = VLLMClient(
+                        base_url=base_url, connection_timeout=args.vllm_server_timeout
+                    )
+                    self.vllm_client.init_communicator(
+                        device=torch.cuda.current_device()
+                    )
+                else:
+                    self.vllm_client = None
+            elif self.vllm_mode == "colocate":
+                # vLLM dynamically adjusts the size of the key-value cache based on available GPU memory at instantiation.
+                # A larger cache size improves speed, so we would expect gpu_memory_utilization=1.
+                # However, at this stage, the optimizer's weights are not yet loaded onto the GPU; they will be loaded
+                # after the first optimizer step and remain in GPU memory throughout training. So we must reserve enough
+                # space for them.
+                # Configure vLLM parameters
+                vllm_kwargs = {
+                    "model": model.name_or_path,
+                    "tensor_parallel_size": self.vllm_tensor_parallel_size,
+                    "gpu_memory_utilization": self.vllm_gpu_memory_utilization,
+                    "model_impl": self.vllm_model_impl,
+                    "max_num_seqs": self.args.per_device_train_batch_size
+                    * self.vllm_tensor_parallel_size,
+                    "max_model_len": args.max_length
+                    + args.max_new_tokens,  # max_length includes prompt + completion
+                    "distributed_executor_backend": "external_launcher",
+                    # Feed identical seed for tp groups to ensure sampling results are the same across workers
+                    "seed": self.accelerator.process_index
+                    // self.vllm_tensor_parallel_size,
+                    # Latest vLLM v1 memory profiler is misled by the high default value (i.e., 32768)
+                    "max_num_batched_tokens": 4096,
+                }
+
+                # vLLM requires the environment variables to be set for distributed training.
+                os.environ["RANK"] = str(self.accelerator.process_index)
+                os.environ["LOCAL_RANK"] = str(self.accelerator.local_process_index)
+                os.environ["WORLD_SIZE"] = str(self.accelerator.num_processes)
+                # Ensure distributed rendezvous variables are set without colliding across concurrent runs
+                ensure_master_addr_port()
+
+                self.llm = LLM(**vllm_kwargs)
+            else:
+                raise ValueError(
+                    f"vllm_mode must be either 'server' or 'colocate', got '{self.vllm_mode}'."
+                )
+            # vLLM specific sampling arguments
+            self.guided_decoding_regex = args.vllm_guided_decoding_regex
+            self._last_loaded_step = (
+                -1
+            )  # tag to avoid useless loading during grad accumulation
+
+            # Set up vLLM generation config
+            generation_params = {
+                "n": 2,  # 2 generations per prompt for Online DPO
+                "repetition_penalty": self.repetition_penalty,
+                "temperature": self.temperature,
+                "top_p": self.top_p,
+                "top_k": -1 if self.top_k is None else self.top_k,
+                "min_p": 0.0 if self.min_p is None else self.min_p,
+                "max_tokens": args.max_new_tokens,
+                "detokenize": False,  # to avoid vllm to decode (we don't need it)
+            }
+            if args.generation_kwargs is not None:
+                generation_params.update(args.generation_kwargs)
+            if self.guided_decoding_regex:
+                generation_params["guided_decoding"] = GuidedDecodingParams(
+                    regex=self.guided_decoding_regex
+                )
+            self.generation_config = SamplingParams(**generation_params)
+
+            # When using vLLM, the main process is responsible for loading the model weights. This can cause process
+            # desynchronization and seems to lead to DeepSpeed hanging during initialization. To prevent this, we
+            # synchronize all processes after vLLM has been fully initialized.
+            self.accelerator.wait_for_everyone()
+        else:
+            # Set up transformers generation config
+            generation_kwargs = {
+                "max_new_tokens": args.max_new_tokens,
+                "do_sample": True,
+                "pad_token_id": self.pad_token_id,
+                "bos_token_id": tokenizer.bos_token_id,
+                "eos_token_id": self.eos_token_id,
+                "temperature": self.temperature,
+                "top_k": self.top_k,
+                "top_p": self.top_p,
+                "repetition_penalty": self.repetition_penalty,
+                "use_cache": True if not self.args.gradient_checkpointing else False,
+            }
+            # Add min_p if supported
+            if self.min_p is not None:
+                generation_kwargs["min_p"] = self.min_p
+            if args.generation_kwargs is not None:
+                generation_kwargs.update(args.generation_kwargs)
+            # Remove None values
+            generation_kwargs = {
+                k: v for k, v in generation_kwargs.items() if v is not None
+            }
+            self.generation_config = GenerationConfig(**generation_kwargs)
+
+        if self.ref_model is not None:
+            if self.is_deepspeed_enabled:
+                self.ref_model = prepare_deepspeed(self.ref_model, self.accelerator)
+            elif self.is_fsdp_enabled:
+                self.ref_model = prepare_fsdp(self.ref_model, self.accelerator)
+            else:
+                self.ref_model = self.accelerator.prepare_model(
+                    self.ref_model, evaluation_mode=True
+                )
+        if self.reward_funcs is not None:
+            for i, reward_func in enumerate(self.reward_funcs):
+                if isinstance(reward_func, PreTrainedModel):
+                    if self.is_deepspeed_enabled:
+                        self.reward_funcs[i] = prepare_deepspeed(
+                            reward_func, self.accelerator
+                        )
+                    else:
+                        # set device placement to True to make `prepare_model` move `reward_func` to device when using fsdp
+                        self.reward_funcs[i] = self.accelerator.prepare_model(
+                            reward_func, evaluation_mode=True, device_placement=True
+                        )
+
+    @property
+    def beta(self):
+        if isinstance(self._beta, list):
+            epoch = self.state.epoch
+            return self._beta[epoch] if epoch < len(self._beta) else self._beta[-1]
+        return self._beta
+
+    @staticmethod
+    def tokenize_row(
+        feature, is_encoder_decoder: bool, tokenizer: PreTrainedTokenizerBase
+    ) -> dict[str, Any]:
+        """Tokenize a single row from a DPO specific dataset."""
+        if not is_encoder_decoder:
+            batch = tokenizer(feature["prompt"], add_special_tokens=False)
+            # Add BOS token to head of prompt. Avoid adding if it's already there
+            if tokenizer.bos_token_id is not None:
+                prompt_len_input_ids = len(batch["input_ids"])
+                if (
+                    prompt_len_input_ids == 0
+                    or tokenizer.bos_token_id != batch["input_ids"][0]
+                ):
+                    batch["input_ids"] = [tokenizer.bos_token_id] + batch["input_ids"]
+                    batch["attention_mask"] = [1] + batch["attention_mask"]
+        else:
+            batch = tokenizer(feature["prompt"], add_special_tokens=True)
+        batch = {f"prompt_{key}": value for key, value in batch.items()}
+        return batch
+
+    # Same as Trainer.get_train_dataloader but skip the "remove_unused_columns".
+    @wraps(Trainer.get_train_dataloader)
+    def get_train_dataloader(self) -> DataLoader:
+        if self.train_dataset is None:
+            raise ValueError("Trainer: training requires a train_dataset.")
+
+        train_dataset = self.train_dataset
+        data_collator = self.data_collator
+        dataloader_params = {
+            "batch_size": self._train_batch_size,
+            "collate_fn": data_collator,
+            "num_workers": self.args.dataloader_num_workers,
+            "pin_memory": self.args.dataloader_pin_memory,
+            "persistent_workers": self.args.dataloader_persistent_workers,
+        }
+
+        if not isinstance(train_dataset, torch.utils.data.IterableDataset):
+            dataloader_params["sampler"] = self._get_train_sampler()
+            dataloader_params["drop_last"] = self.args.dataloader_drop_last
+            dataloader_params["worker_init_fn"] = seed_worker
+            dataloader_params["prefetch_factor"] = self.args.dataloader_prefetch_factor
+
+        return self.accelerator.prepare(DataLoader(train_dataset, **dataloader_params))
+
+    # Same as Trainer.get_eval_dataloader but skip the "remove_unused_columns".
+    @wraps(Trainer.get_eval_dataloader)
+    def get_eval_dataloader(
+        self, eval_dataset: str | Dataset | None = None
+    ) -> DataLoader:
+        if eval_dataset is None and self.eval_dataset is None:
+            raise ValueError("Trainer: evaluation requires an eval_dataset.")
+
+        # If we have persistent workers, don't do a fork bomb especially as eval datasets
+        # don't change during training
+        dataloader_key = eval_dataset if isinstance(eval_dataset, str) else "eval"
+        if (
+            hasattr(self, "_eval_dataloaders")
+            and dataloader_key in self._eval_dataloaders
+            and self.args.dataloader_persistent_workers
+        ):
+            return self.accelerator.prepare(self._eval_dataloaders[dataloader_key])
+
+        eval_dataset = (
+            self.eval_dataset[eval_dataset]
+            if isinstance(eval_dataset, str)
+            else eval_dataset
+            if eval_dataset is not None
+            else self.eval_dataset
+        )
+        data_collator = self.data_collator
+
+        dataloader_params = {
+            "batch_size": self.args.eval_batch_size,
+            "collate_fn": data_collator,
+            "num_workers": self.args.dataloader_num_workers,
+            "pin_memory": self.args.dataloader_pin_memory,
+            "persistent_workers": self.args.dataloader_persistent_workers,
+        }
+
+        if not isinstance(eval_dataset, torch.utils.data.IterableDataset):
+            dataloader_params["sampler"] = self._get_eval_sampler(eval_dataset)
+            dataloader_params["drop_last"] = self.args.dataloader_drop_last
+            dataloader_params["prefetch_factor"] = self.args.dataloader_prefetch_factor
+
+        # accelerator.free_memory() will destroy the references, so
+        # we need to store the non-prepared version
+        eval_dataloader = DataLoader(eval_dataset, **dataloader_params)
+        if self.args.dataloader_persistent_workers:
+            if hasattr(self, "_eval_dataloaders"):
+                self._eval_dataloaders[dataloader_key] = eval_dataloader
+            else:
+                self._eval_dataloaders = {dataloader_key: eval_dataloader}
+
+        return self.accelerator.prepare(eval_dataloader)
+
+    def _enable_gradient_checkpointing(
+        self, model: PreTrainedModel, args: OnlineDPOConfig
+    ) -> PreTrainedModel:
+        """Enables gradient checkpointing for the model."""
+        # Ensure use_cache is disabled
+        model.config.use_cache = False
+
+        # Enable gradient checkpointing on the base model for PEFT
+        if is_peft_model(model):
+            model.base_model.gradient_checkpointing_enable()
+        # Enable gradient checkpointing for non-PEFT models
+        else:
+            model.gradient_checkpointing_enable()
+
+        gradient_checkpointing_kwargs = args.gradient_checkpointing_kwargs or {}
+        use_reentrant = (
+            "use_reentrant" not in gradient_checkpointing_kwargs
+            or gradient_checkpointing_kwargs["use_reentrant"]
+        )
+
+        if use_reentrant:
+            model.enable_input_require_grads()
+
+        return model
+
+    def _generate_vllm(self, prompts, images=None):
+        eos_token_id = self.eos_token_id
+        pad_token_id = self.pad_token_id
+
+        # Generate completion_ids and prompt_ids based on mode
+        if self.vllm_mode == "server":
+            completion_ids, prompt_ids = self._generate_vllm_server(prompts, images)
+        elif self.vllm_mode == "colocate":
+            completion_ids, prompt_ids = self._generate_vllm_colocate(prompts, images)
+
+        # Shared padding, masking, and tensor conversion logic
+        max_prompt_length = max(len(ids) for ids in prompt_ids)
+        prompt_mask = [
+            [0] * (max_prompt_length - len(ids)) + [1] * len(ids) for ids in prompt_ids
+        ]
+        prompt_ids = [
+            [pad_token_id] * (max_prompt_length - len(ids)) + ids for ids in prompt_ids
+        ]
+        max_tokens = self.generation_config.max_tokens
+        completion_mask = [
+            [1] * len(ids) + [0] * (max_tokens - len(ids)) for ids in completion_ids
+        ]
+        completion_ids = [
+            ids + [eos_token_id]
+            if ids[-1] != eos_token_id and len(ids) < max_tokens
+            else ids
+            for ids in completion_ids
+        ]
+        completion_ids = [
+            ids + [pad_token_id] * (max_tokens - len(ids)) for ids in completion_ids
+        ]
+
+        # Convert to tensors
+        prompt_ids = torch.tensor(prompt_ids, device=self.accelerator.device)
+        prompt_mask = torch.tensor(prompt_mask, device=self.accelerator.device)
+        completion_ids = torch.tensor(completion_ids, device=self.accelerator.device)
+        completion_mask = torch.tensor(completion_mask, device=self.accelerator.device)
+
+        return prompt_ids, prompt_mask, completion_ids, completion_mask
+
+    def _generate_vllm_server(self, prompts, images=None):
+        """Generate completions using vLLM server mode"""
+        has_images = images is not None
+
+        # Update vLLM server weights if needed
+        if (
+            hasattr(self, "_last_loaded_step")
+            and self.state.global_step != self._last_loaded_step
+            or not hasattr(self, "_last_loaded_step")
+        ):
+            self._move_model_to_vllm()
+            self._last_loaded_step = self.state.global_step
+
+        # Apply chat template if conversational
+        if is_conversational({"prompt": prompts[0]}):
+            prompts_text = [
+                apply_chat_template({"prompt": p}, self.processing_class)["prompt"]
+                for p in prompts
+            ]
+        else:
+            prompts_text = prompts
+        # Gather all prompts to main process
+        all_prompts = gather_object(prompts_text)
+        if has_images:
+            all_images = gather_object(images)
+
+        if self.accelerator.is_main_process:
+            # Since 'prompts' contains 'num_generations' duplicates, we first take unique prompts, and generate
+            # num_generations outputs for each one. This is faster than generating outputs for each duplicate
+            # prompt individually.
+            ordered_set_of_prompts = all_prompts[:: self.num_generations]
+            if has_images:
+                ordered_set_of_images = all_images[:: self.num_generations]
+            else:
+                ordered_set_of_images = None
+            completion_ids = self.vllm_client.generate(
+                prompts=ordered_set_of_prompts,
+                images=ordered_set_of_images,
+                n=self.num_generations,
+                repetition_penalty=self.repetition_penalty,
+                temperature=self.temperature,
+                top_p=self.top_p,
+                top_k=-1 if self.top_k is None else self.top_k,
+                min_p=0.0 if self.min_p is None else self.min_p,
+                max_tokens=self.generation_config.max_tokens,
+                guided_decoding_regex=self.guided_decoding_regex
+                if hasattr(self, "guided_decoding_regex")
+                else None,
+                generation_kwargs=self.args.generation_kwargs,
+            )
+            # Flatten: each prompt generates 2 completions
+            completion_ids = [
+                [comp_id]
+                for prompt_completions in completion_ids
+                for comp_id in prompt_completions
+            ]
+        else:
+            completion_ids = [None] * (len(all_prompts) * 2)
+
+        # Broadcast completions to all processes
+        completion_ids = broadcast_object_list(completion_ids, from_process=0)
+
+        # Each process takes its slice
+        process_slice = slice(
+            self.accelerator.process_index * len(prompts) * 2,
+            (self.accelerator.process_index + 1) * len(prompts) * 2,
+        )
+        completion_ids = completion_ids[process_slice]
+
+        # Create prompt_ids by tokenizing locally
+        prompt_inputs = self.processing_class(
+            text=prompts_text,
+            return_tensors="pt",
+            padding=True,
+            padding_side="left",
+            add_special_tokens=False,
+        )
+        prompt_ids = []
+        for prompt_tokens in prompt_inputs["input_ids"]:
+            prompt_ids.extend(
+                [prompt_tokens.tolist(), prompt_tokens.tolist()]
+            )  # 2 copies for 2 completions
+        return completion_ids, prompt_ids
+
+    def _generate_vllm_colocate(self, prompts, images=None):
+        """Generate completions using vLLM colocate mode"""
+        # Update model weights if needed - only after gradient accumulation completes
+        if self.state.global_step != self._last_loaded_step:
+            self._move_model_to_vllm()
+            self._last_loaded_step = self.state.global_step
+
+        # Apply chat template if conversational
+        if is_conversational({"prompt": prompts[0]}):
+            prompts_text = [
+                apply_chat_template({"prompt": p}, self.processing_class)["prompt"]
+                for p in prompts
+            ]
+        else:
+            prompts_text = prompts
+
+        # Prepare vLLM inputs with images if available
+        if images is not None:
+            vllm_inputs = []
+            for prompt, image in zip(prompts_text, images, strict=True):
+                if image is not None:
+                    vllm_inputs.append(
+                        {"prompt": prompt, "multi_modal_data": {"image": image}}
+                    )
+                else:
+                    vllm_inputs.append(prompt)
+        else:
+            vllm_inputs = prompts_text
+
+        outputs = self.llm.generate(vllm_inputs, self.generation_config, use_tqdm=False)
+
+        completion_ids = [
+            list(output.outputs[i].token_ids) for i in range(2) for output in outputs
+        ]
+        prompt_ids = [
+            list(output.prompt_token_ids) for _ in range(2) for output in outputs
+        ]
+
+        return completion_ids, prompt_ids
+
+    def _move_model_to_vllm(self):
+        """Synchronize model weights to vLLM server with support for PEFT, DeepSpeed, and FSDP"""
+        # For DeepSpeed ZeRO-3 and FSDP, we need to gather all parameters before operations
+        deepspeed_plugin = self.accelerator.state.deepspeed_plugin
+        zero_stage_3 = deepspeed_plugin is not None and deepspeed_plugin.zero_stage == 3
+        if zero_stage_3:
+            import deepspeed
+
+            gather_if_zero3 = deepspeed.zero.GatheredParameters
+        else:
+            gather_if_zero3 = nullcontext
+
+        if is_peft_model(self.model):
+            # With PEFT and FSDP/DeepSpeed ZeRO Stage 3, we must gather the full model at once before merging, as
+            # merging adapters in a sharded manner is not supported.
+            # TODO: does this work with FSDP?
+            with gather_if_zero3(list(self.model.parameters())):
+                self.model.merge_adapter()
+
+                # Update vLLM weights while parameters are gathered
+                if (
+                    self.is_fsdp_enabled
+                ):  # note if using FSDP, gather_if_zero3 is nullcontext
+                    # Update vLLM weights while parameters are gathered
+                    # For PEFT with FSDP we need to use the memory efficient post-order traversal
+                    fsdp_plugin = getattr(self.accelerator.state, "fsdp_plugin", None)
+                    fsdp_version = (
+                        getattr(fsdp_plugin, "fsdp_version", 1) if fsdp_plugin else 1
+                    )
+                    if fsdp_version == 1:
+                        # use memory-efficient post-order traversal for FSDP
+                        self._sync_fsdp1_params_to_vllm(self.model)
+                    elif fsdp_version == 2:
+                        self._sync_fsdp2_params_to_vllm(self.model)
+                else:
+                    # DeepSpeed ZeRO-3 with PEFT
+                    for name, param in self.model.named_parameters():
+                        # When using PEFT, we need to recover the original parameter name and discard some parameters
+                        name = name.removeprefix("base_model.model.").replace(
+                            ".base_layer", ""
+                        )
+                        if self.model.prefix in name:
+                            continue
+                        # When module to save, remove its prefix and discard the original module
+                        if "original_module" in name:
+                            continue
+                        name = self._fix_param_name_to_vllm(
+                            name, extra_prefixes=["modules_to_save.default."]
+                        )
+
+                        if (
+                            self.vllm_mode == "server"
+                            and self.accelerator.is_main_process
+                        ):
+                            self.vllm_client.update_named_param(name, param.data)
+                        elif self.vllm_mode == "colocate":
+                            llm_model = self.llm.llm_engine.model_executor.driver_worker.model_runner.model
+                            llm_model.load_weights([(name, param.data)])
+                # Unmerge adapters while parameters are still gathered
+                self.model.unmerge_adapter()
+                # Parameters will automatically be repartitioned when exiting the context
+        # For non-PEFT models, simply gather (if needed) and update each parameter individually.
+        elif self.is_fsdp_enabled:
+            fsdp_plugin = getattr(self.accelerator.state, "fsdp_plugin", None)
+            fsdp_version = getattr(fsdp_plugin, "fsdp_version", 1) if fsdp_plugin else 1
+            if fsdp_version == 1:
+                self._sync_fsdp1_params_to_vllm(
+                    self.model
+                )  # use memory-efficient post-order traversal for FSDP
+            elif fsdp_version == 2:
+                self._sync_fsdp2_params_to_vllm(self.model)
+        else:
+            for name, param in self.model.named_parameters():
+                name = self._fix_param_name_to_vllm(name)
+                with gather_if_zero3([param]):
+                    if self.vllm_mode == "server" and self.accelerator.is_main_process:
+                        self.vllm_client.update_named_param(name, param.data)
+                    elif self.vllm_mode == "colocate":
+                        llm_model = self.llm.llm_engine.model_executor.driver_worker.model_runner.model
+                        llm_model.load_weights([(name, param.data)])
+
+        # Reset cache on vLLM
+        if self.vllm_mode == "server" and self.accelerator.is_main_process:
+            self.vllm_client.reset_prefix_cache()
+        elif self.vllm_mode == "colocate":
+            self.llm.reset_prefix_cache()
+
+    def _sync_fsdp1_params_to_vllm(
+        self, module: nn.Module, prefix: str = "", visited=None
+    ):
+        """Memory-efficient post-order traversal of FSDP modules to extract full parameters and sync with vLLM."""
+        # For FSDP1, we need to recurse into children and also use summon_full_params
+        if visited is None:
+            visited = set()
+        for child_name, child_module in module.named_children():
+            child_prefix = f"{prefix}.{child_name}" if prefix else child_name
+            self._sync_fsdp1_params_to_vllm(
+                child_module, prefix=child_prefix, visited=visited
+            )  # recurse into the child
+
+        if isinstance(module, FSDP):
+            with FSDP.summon_full_params(module, recurse=False, writeback=False):
+                for param_name, param in module.named_parameters():
+                    full_name = f"{prefix}.{param_name}" if prefix else param_name
+                    full_name = self._fix_param_name_to_vllm(
+                        full_name, extra_prefixes=["_fsdp_wrapped_module."]
+                    )
+
+                    if full_name in visited:
+                        continue  # skip FSDP subtrees already traversed
+                    visited.add(full_name)
+
+                    if self.vllm_mode == "server" and self.accelerator.is_main_process:
+                        self.vllm_client.update_named_param(full_name, param.data)
+                    elif self.vllm_mode == "colocate":
+                        llm_model = self.llm.llm_engine.model_executor.driver_worker.model_runner.model
+                        llm_model.load_weights([(full_name, param.data)])
+
+    def _sync_fsdp2_params_to_vllm(self, module: nn.Module):
+        # For FSDP2, module.state_dict() already covers all parameters, so no need for recursion
+        for name, param in module.state_dict().items():
+            if param.is_cpu:
+                param = param.to(torch.device("cuda"))
+            param = param.full_tensor()
+
+            if self.vllm_mode == "server" and self.accelerator.is_main_process:
+                self.vllm_client.update_named_param(name, param)
+            elif self.vllm_mode == "colocate":
+                llm_model = (
+                    self.llm.llm_engine.model_executor.driver_worker.model_runner.model
+                )
+                llm_model.load_weights([(name, param)])
+
+    def _fix_param_name_to_vllm(self, name, extra_prefixes: list[str] | None = None):
+        """Clean parameter names for vLLM compatibility"""
+        extra_prefixes = extra_prefixes or []
+        prefixes = ["_checkpoint_wrapped_module."] + extra_prefixes
+        for prefix in prefixes:
+            name = name.replace(prefix, "")
+        return name
+
+    def process_vision_row(
+        self, features: dict[str, list | torch.Tensor], processing_class=None
+    ) -> dict[str, list[int]]:
+        """
+        Process a vision row for VLM models (adapted from DPO trainer)
+        """
+        processor = processing_class or self.processing_class
+        processed_features = processor(
+            images=[features["image"]],
+            text=features["prompt"],
+            add_special_tokens=False,
+        )
+
+        prompt_input_ids = processed_features["input_ids"][0]
+
+        # Create the output dict with required fields
+        output = {
+            "prompt_input_ids": prompt_input_ids,
+            "prompt_attention_mask": processed_features["attention_mask"][0],
+        }
+
+        # Add vision-specific fields
+        if "pixel_values" in processed_features:
+            output["pixel_values"] = processed_features["pixel_values"][0]
+        if "pixel_attention_mask" in processed_features:
+            output["pixel_attention_mask"] = processed_features["pixel_attention_mask"][
+                0
+            ]
+        if "image_sizes" in processed_features:
+            output["image_sizes"] = processed_features["image_sizes"][0]
+
+        return output
+
+    def _generate(self, model, prompts, images=None):
+        """Generate completions using the model"""
+        device = next(model.parameters()).device
+        eos_token_id = self.eos_token_id
+        pad_token_id = self.pad_token_id
+
+        # Apply chat template and tokenize the input
+        inputs = [{"prompt": prompt} for prompt in prompts]
+
+        # Add images if provided (VLM support)
+        if images is not None:
+            for i, image in enumerate(images):
+                inputs[i]["image"] = image
+
+        # Apply chat template to get text prompts
+        prompts_text = [
+            maybe_apply_chat_template(x, self.processing_class)["prompt"]
+            for x in inputs
+        ]
+
+        # Handle image token collapsing/removal
+        # The chat template sometimes inserts a single image token into the prompt text. However, when this text is
+        # later tokenized, the single image token string is expanded into multiple image token IDs, depending on the
+        # image size. We need to handle this properly.
+        if self.image_token is not None and images is not None:
+            escaped_img_token = re.escape(self.image_token)
+            # Search for the image token in the chat template
+            if (
+                hasattr(self.processing_class, "chat_template")
+                and self.processing_class.chat_template
+            ):
+                if re.search(escaped_img_token, self.processing_class.chat_template):
+                    # Collapse repeated image tokens back into a single token
+                    prompts_text = [
+                        re.sub(rf"({escaped_img_token})+", self.image_token, text)
+                        for text in prompts_text
+                    ]
+                # If the chat template doesn't use the image token, remove all instances
+                elif self.vision_end_token_id is not None:
+                    escaped_eoi_token = re.escape(
+                        self.processing_class.tokenizer.decode(
+                            [self.vision_end_token_id]
+                        )
+                    )
+                    prompts_text = [
+                        re.sub(rf"({escaped_img_token})+{escaped_eoi_token}", "", text)
+                        for text in prompts_text
+                    ]
+                else:
+                    # If vision_end_token_id is None, just remove the image tokens
+                    prompts_text = [
+                        re.sub(rf"({escaped_img_token})+", "", text)
+                        for text in prompts_text
+                    ]
+
+        # Prepare kwargs for processing class
+        kwargs = {}
+        if images is not None:
+            kwargs = {"images": [[img] for img in images]}
+
+        # Process inputs using the processing class (handles both VLM and LLM)
+        prompt_inputs = self.processing_class(
+            text=prompts_text,
+            return_tensors="pt",
+            padding=True,
+            padding_side="left",
+            add_special_tokens=False,
+            **kwargs,
+        )
+
+        prompt_inputs = {k: v.to(device) for k, v in prompt_inputs.items()}
+        # Convert vision inputs to model's dtype for proper computation
+        if "pixel_values" in prompt_inputs:
+            # Handle DataParallel wrapped models
+            model_dtype = getattr(model, "dtype", None)
+            if model_dtype is None and hasattr(model, "module"):
+                model_dtype = model.module.dtype
+            if model_dtype is not None:
+                prompt_inputs["pixel_values"] = prompt_inputs["pixel_values"].to(
+                    model_dtype
+                )
+
+        # Sample 2 completions per prompt of size `max_new_tokens` from the model
+        prompt_ids = prompt_inputs["input_ids"].repeat(2, 1)
+        prompt_mask = prompt_inputs["attention_mask"].repeat(2, 1)
+
+        # Prepare vision inputs if available
+        vision_generation_kwargs = {}
+        if self.is_vision_model and images is not None:
+            if "pixel_values" in prompt_inputs:
+                vision_generation_kwargs["pixel_values"] = prompt_inputs[
+                    "pixel_values"
+                ].repeat(2, 1, 1, 1)
+            if "pixel_attention_mask" in prompt_inputs:
+                vision_generation_kwargs["pixel_attention_mask"] = prompt_inputs[
+                    "pixel_attention_mask"
+                ].repeat(2, 1)
+            if "image_sizes" in prompt_inputs:
+                vision_generation_kwargs["image_sizes"] = prompt_inputs[
+                    "image_sizes"
+                ].repeat(2, 1)
+            if "image_grid_thw" in prompt_inputs:
+                vision_generation_kwargs["image_grid_thw"] = prompt_inputs[
+                    "image_grid_thw"
+                ].repeat(2, 1)
+
+        if self.use_transformers_paged:
+            previous_attn = self.model_wrapped.config._attn_implementation
+
+            if (
+                version.parse(transformers.__version__).release
+                >= version.parse("5.0.0").release
+            ):
+                new_attn = (
+                    "paged|flash_attention_2"
+                    if is_flash_attn_2_available()
+                    else "paged|sdpa"
+                )
+            else:
+                new_attn = (
+                    "paged_attention" if is_flash_attn_2_available() else "sdpa_paged"
+                )
+            self.model_wrapped.config._attn_implementation = new_attn
+            with (
+                profiling_context(self, "transformers.generate_batch"),
+                unwrap_model_for_generation(
+                    model,
+                    self.accelerator,
+                    gather_deepspeed3_params=self.args.ds3_gather_for_generation,
+                ) as unwrapped_model,
+                torch.no_grad(),
+                FSDP.summon_full_params(self.model_wrapped, recurse=False)
+                if self.is_fsdp_enabled
+                else nullcontext(),
+            ):
+                # Cast to the appropriate dtype based on training configuration
+                if self.args.bf16:
+                    unwrapped_model.to(torch.bfloat16)
+                elif self.args.fp16:
+                    unwrapped_model.to(torch.float16)
+                with torch.inference_mode():
+                    all_outputs = unwrapped_model.generate_batch(
+                        prompt_ids.tolist(),
+                        generation_config=self.generation_config,
+                        progress_bar=False,
+                    )
+                    unwrapped_model.train()  # restore training mode, as generate_batch forces eval mode
+            completion_ids = [
+                output.generated_tokens for output in all_outputs.values()
+            ]
+            completion_ids = [
+                torch.tensor(ids, device=device) for ids in completion_ids
+            ]
+            completion_ids = pad(
+                completion_ids, padding_value=self.pad_token_id, padding_side="right"
+            )
+            prompt_completion_ids = torch.cat([prompt_ids, completion_ids], dim=1)
+            # Restore the original attention implementation, training mode
+            self.model_wrapped.config._attn_implementation = previous_attn
+
+            # Extract completion_ids and create completion_mask
+            prompt_length = prompt_ids.size(1)
+            completion_ids = prompt_completion_ids[:, prompt_length:]
+            completion_ids, completion_mask = truncate_right(
+                completion_ids, eos_token_id, pad_token_id
+            )
+
+            return prompt_ids, prompt_mask, completion_ids, completion_mask
+        # Regular generation path
+        with (
+            profiling_context(self, "transformers.generate"),
+            unwrap_model_for_generation(
+                model,
+                self.accelerator,
+                gather_deepspeed3_params=self.args.ds3_gather_for_generation,
+            ) as unwrapped_model,
+            torch.no_grad(),
+            FSDP.summon_full_params(self.model_wrapped, recurse=False)
+            if self.is_fsdp_enabled
+            else nullcontext(),
+        ):
+            # Setup cache implementation if specified
+            if self.args.cache_implementation is not None:
+                unwrapped_model.generation_config.cache_implementation = (
+                    self.args.cache_implementation
+                )
+
+            # Standard generation
+            output = unwrapped_model.generate(
+                input_ids=prompt_ids,
+                attention_mask=prompt_mask,
+                generation_config=self.generation_config,
+                **vision_generation_kwargs,
+            )
+
+        completion_ids = output[:, prompt_ids.size(1) :]
+        completion_ids, completion_mask = truncate_right(
+            completion_ids, eos_token_id, pad_token_id
+        )
+
+        return prompt_ids, prompt_mask, completion_ids, completion_mask
+
+    def _calculate_rewards_from_functions(
+        self, prompts, completions, completion_ids_list, **reward_kwargs
+    ):
+        """
+        Calculate rewards using reward functions
+        """
+        device = self.accelerator.device
+        rewards_per_func = torch.zeros(
+            len(prompts), len(self.reward_funcs), device=device
+        )
+
+        # Add trainer state to reward kwargs for dynamic reward shaping
+        reward_kwargs["trainer_state"] = self.state
+
+        for i, (reward_func, reward_processing_class) in enumerate(
+            zip(self.reward_funcs, self.reward_processing_classes, strict=True)
+        ):
+            if isinstance(reward_func, nn.Module):  # Model-based reward function
+                # Handle conversational vs text input
+                if is_conversational({"prompt": prompts[0]}):
+                    messages = [
+                        {"messages": p + c}
+                        for p, c in zip(prompts, completions, strict=True)
+                    ]
+                    texts = [
+                        apply_chat_template(x, reward_processing_class)["text"]
+                        for x in messages
+                    ]
+                else:
+                    texts = [p + c for p, c in zip(prompts, completions, strict=True)]
+
+                # Tokenize and get reward scores
+                reward_inputs = reward_processing_class(
+                    text=texts,
+                    return_tensors="pt",
+                    padding=True,
+                    padding_side="right",
+                    add_special_tokens=False,
+                )
+                reward_inputs = {k: v.to(device) for k, v in reward_inputs.items()}
+
+                with torch.inference_mode():
+                    rewards_per_func[:, i] = reward_func(**reward_inputs).logits[
+                        :, 0
+                    ]  # Shape (B*G,)
+            else:
+                # Custom reward function
+                output_reward_func = reward_func(
+                    prompts=prompts,
+                    completions=completions,
+                    completion_ids=completion_ids_list,
+                    **reward_kwargs,
+                )
+                # Convert None values to NaN
+                output_reward_func = [
+                    reward if reward is not None else torch.nan
+                    for reward in output_reward_func
+                ]
+                rewards_per_func[:, i] = torch.tensor(
+                    output_reward_func, dtype=torch.float32, device=device
+                )
+
+        # Weight and sum across all reward functions
+        if self.reward_weights is not None:
+            total_rewards = (
+                rewards_per_func * self.reward_weights.to(device).unsqueeze(0)
+            ).nansum(dim=1)
+        else:
+            total_rewards = rewards_per_func.nansum(dim=1)
+
+        return total_rewards
+
+    def _forward(
+        self,
+        model,
+        prompt_ids,
+        prompt_mask,
+        completion_ids,
+        completion_mask,
+        vision_inputs=None,
+    ):
+        # Get the number of tokens to truncate from prompt
+        num_tokens_to_truncate = max(
+            prompt_ids.size(1) + completion_ids.size(1) - self.max_length, 0
+        )
+
+        # Truncate left to avoid oom
+        prompt_ids = prompt_ids[:, num_tokens_to_truncate:]
+        prompt_mask = prompt_mask[:, num_tokens_to_truncate:]
+
+        # Concat the prompt and completion
+        prompt_completion_ids = torch.cat((prompt_ids, completion_ids), dim=1)
+        prompt_completion_mask = torch.cat((prompt_mask, completion_mask), dim=1)
+
+        # Prepare model kwargs with vision inputs if available
+        model_kwargs = {"attention_mask": prompt_completion_mask}
+        if vision_inputs is not None:
+            if "pixel_values" in vision_inputs:
+                model_kwargs["pixel_values"] = vision_inputs["pixel_values"]
+            if "pixel_attention_mask" in vision_inputs:
+                model_kwargs["pixel_attention_mask"] = vision_inputs[
+                    "pixel_attention_mask"
+                ]
+            if "image_sizes" in vision_inputs:
+                model_kwargs["image_sizes"] = vision_inputs["image_sizes"]
+            if "image_grid_thw" in vision_inputs:
+                model_kwargs["image_grid_thw"] = vision_inputs["image_grid_thw"]
+
+        # Get the logprobs of the completions from the model
+        output = model(prompt_completion_ids, **model_kwargs)
+
+        # There is 1 offset, because the model predicts the next token
+        prompt_len = prompt_ids.size(1)
+        start_idx = prompt_len - 1 if prompt_len > 0 else 0
+        # Only slice off the last logit when we have a prompt, otherwise we need all logits
+        end_idx = -1 if prompt_len > 0 else None
+        logits = output.logits[:, start_idx:end_idx]
+
+        # Take the completion tokens logprob
+        logprobs = torch.take_along_dim(
+            logits.log_softmax(dim=-1), completion_ids.unsqueeze(-1), dim=2
+        ).squeeze(-1)
+        return logprobs
+
+    def training_step(
+        self,
+        model: nn.Module,
+        inputs: dict[str, torch.Tensor | Any],
+        num_items_in_batch: int | None = None,
+    ) -> torch.Tensor:
+        model.train()
+
+        prompts = inputs["prompt"]
+        batch_size = len(prompts)
+
+        # Handle images for VLM support
+        has_images = "image" in inputs
+        images = None
+        if has_images:
+            images = inputs["image"]
+            # Convert conversational prompts to include image tokens
+            for prompt in prompts:
+                if isinstance(prompt, list):
+                    for message in prompt:
+                        if not isinstance(message, dict):
+                            continue
+                        content = message.get("content")
+                        role = message.get("role")
+                        if isinstance(content, str):
+                            if role == "user":
+                                message["content"] = [
+                                    {"type": "image"},
+                                    {"type": "text", "text": content},
+                                ]
+                            elif role == "system":
+                                message["content"] = [{"type": "text", "text": content}]
+
+        if self.args.use_vllm:
+            prompt_ids, prompt_mask, completion_ids, completion_mask = (
+                self._generate_vllm(prompts, images)
+            )
+        else:
+            prompt_ids, prompt_mask, completion_ids, completion_mask = self._generate(
+                model, prompts, images
+            )
+
+        contain_eos_token = torch.any(completion_ids == self.eos_token_id, dim=-1)
+
+        # Extract vision inputs if available for VLM support
+        vision_inputs = None
+        if has_images and self.is_vision_model and not self.args.use_vllm:
+            # For vision models with transformers generation, we need to prepare vision inputs
+            # Process the images to get vision inputs that can be passed through the forward pass
+            vision_inputs = {}
+            kwargs = {"images": [[img] for img in images]}
+            processed = self.processing_class(
+                text=[""] * len(images),  # Dummy text for vision processing
+                return_tensors="pt",
+                **kwargs,
+            )
+            # Handle DataParallel wrapped models
+            model_device = getattr(model, "device", None)
+            model_dtype = getattr(model, "dtype", None)
+            if model_device is None and hasattr(model, "module"):
+                model_device = model.module.device
+                model_dtype = model.module.dtype
+            # Move vision tensors to device and convert to model dtype
+            # Need to duplicate for 2 completions per prompt
+            if "pixel_values" in processed:
+                vision_inputs["pixel_values"] = (
+                    processed["pixel_values"]
+                    .to(model_device, dtype=model_dtype)
+                    .repeat(2, 1, 1, 1)
+                )
+            if "pixel_attention_mask" in processed:
+                vision_inputs["pixel_attention_mask"] = (
+                    processed["pixel_attention_mask"].to(model_device).repeat(2, 1)
+                )
+            if "image_sizes" in processed:
+                vision_inputs["image_sizes"] = (
+                    processed["image_sizes"].to(model_device).repeat(2, 1)
+                )
+            if "image_grid_thw" in processed:
+                vision_inputs["image_grid_thw"] = (
+                    processed["image_grid_thw"].to(model_device).repeat(2, 1)
+                )
+
+        logprobs = self._forward(
+            model,
+            prompt_ids,
+            prompt_mask,
+            completion_ids,
+            completion_mask,
+            vision_inputs,
+        )
+        with torch.no_grad():
+            if self.ref_model is not None:
+                ref_logprobs = self._forward(
+                    self.ref_model,
+                    prompt_ids,
+                    prompt_mask,
+                    completion_ids,
+                    completion_mask,
+                    vision_inputs,
+                )
+            else:  # peft case: we just need to disable the adapter
+                with self.model.disable_adapter():
+                    ref_logprobs = self._forward(
+                        self.model,
+                        prompt_ids,
+                        prompt_mask,
+                        completion_ids,
+                        completion_mask,
+                        vision_inputs,
+                    )
+
+        # Decode the completions, and format them if the input is conversational
+        device = logprobs.device
+        completions = self.processing_class.batch_decode(
+            completion_ids, skip_special_tokens=True
+        )
+        if is_conversational({"prompt": prompts[0]}):
+            completions = [
+                [{"role": "assistant", "content": completion}]
+                for completion in completions
+            ]
+
+        # Get the reward from reward functions or judge
+        if self.reward_funcs is not None:
+            # First create completion_ids_list for custom reward functions
+            completion_ids_list = [
+                completion_ids[i].tolist() for i in range(completion_ids.shape[0])
+            ]
+
+            # Extract additional fields from inputs for reward functions
+            reward_kwargs = {}
+            keys = [key for key in inputs if key not in ["prompt"]]
+            for key in keys:
+                if isinstance(inputs[key], (list, tuple)):
+                    # Repeat input fields to match number of completions (2 per prompt)
+                    reward_kwargs[key] = inputs[key] * 2
+                else:
+                    reward_kwargs[key] = inputs[key]
+
+            # Calculate rewards using reward functions
+            rewards = self._calculate_rewards_from_functions(
+                prompts=2 * prompts,
+                completions=completions,
+                completion_ids_list=completion_ids_list,
+                **reward_kwargs,
+            )
+
+            # Apply missing EOS penalty if configured
+            if self.args.missing_eos_penalty is not None:
+                rewards[~contain_eos_token] -= self.args.missing_eos_penalty
+
+            # Split rewards into chosen/rejected pairs
+            first_half, second_half = rewards.split(batch_size)
+            mask = first_half >= second_half
+        elif self.judge is not None:
+            # Once formatted, conversational data may contain special tokens (such as <|im_start|>) that are not
+            # directly understandable by the judge and could alter its judgment. To avoid this and make the judge
+            # independent of the model's chat template, we use the raw conversation data, and apply our own chat
+            # template to it.
+            if is_conversational({"prompt": prompts[0]}):
+                environment = jinja2.Environment()
+                template = environment.from_string(SIMPLE_CHAT_TEMPLATE)
+                prompts = [template.render(messages=prompt) for prompt in prompts]
+                completions = [
+                    template.render(messages=completion) for completion in completions
+                ]
+
+            ranks_of_first_completion = self.judge.judge(
+                prompts,
+                list(
+                    zip(completions[:batch_size], completions[batch_size:], strict=True)
+                ),
+            )
+
+            # convert ranks to a True/False mask:
+            # when rank == 0, it means the first completion is the best
+            # when rank == 1, it means the second completion is the best
+            mask = torch.tensor(
+                [rank == 0 for rank in ranks_of_first_completion], device=device
+            )
+
+        batch_range = torch.arange(batch_size, device=device)
+        chosen_indices = batch_range + (~mask * batch_size)
+        rejected_indices = batch_range + (mask * batch_size)
+
+        # Build tensor so that the first half is the chosen examples and the second half the rejected examples
+        cr_indices = torch.cat(
+            (chosen_indices, rejected_indices), dim=0
+        )  # cr = chosen and rejected
+        cr_logprobs = logprobs[cr_indices]
+        cr_ref_logprobs = ref_logprobs[cr_indices]
+
+        # mask out the padding tokens
+        padding_mask = ~completion_mask.bool()
+        cr_padding_mask = padding_mask[cr_indices]
+
+        cr_logprobs_sum = (cr_logprobs * ~cr_padding_mask).sum(1)
+        cr_ref_logprobs_sum = (cr_ref_logprobs * ~cr_padding_mask).sum(1)
+
+        # Split the chosen and rejected examples
+        chosen_logprobs_sum, rejected_logprobs_sum = torch.split(
+            cr_logprobs_sum, batch_size
+        )
+        chosen_ref_logprobs_sum, rejected_ref_logprobs_sum = torch.split(
+            cr_ref_logprobs_sum, batch_size
+        )
+        pi_logratios = chosen_logprobs_sum - rejected_logprobs_sum
+        ref_logratios = chosen_ref_logprobs_sum - rejected_ref_logprobs_sum
+
+        logits = pi_logratios - ref_logratios
+
+        if self.args.loss_type == "sigmoid":
+            losses = -F.logsigmoid(self.beta * logits)
+        elif self.args.loss_type == "ipo":
+            losses = (logits - 1 / (2 * self.beta)) ** 2
+        else:
+            raise NotImplementedError(f"invalid loss type {self.loss_type}")
+
+        loss = losses.mean()
+
+        # Log everything
+        if self.reward_funcs is not None:
+            # When using reward_funcs, we have rewards instead of scores
+            scores_margin = rewards[chosen_indices] - rewards[rejected_indices]
+            self.stats["objective/scores_margin"].append(
+                self.accelerator.gather_for_metrics(scores_margin.mean()).mean().item()
+            )
+            self.stats["objective/scores"].append(
+                self.accelerator.gather_for_metrics(rewards.mean()).mean().item()
+            )
+        self.stats["val/contain_eos_token"].append(
+            contain_eos_token.float().mean().item()
+        )
+        self.stats["logps/chosen"].append(
+            self.accelerator.gather_for_metrics(chosen_logprobs_sum).mean().item()
+        )
+        self.stats["logps/rejected"].append(
+            self.accelerator.gather_for_metrics(rejected_logprobs_sum).mean().item()
+        )
+
+        kl = logprobs - ref_logprobs
+        mean_kl = kl.sum(1).mean()
+        self.stats["objective/kl"].append(
+            self.accelerator.gather_for_metrics(mean_kl).mean().item()
+        )
+        non_score_reward = (-self.beta * kl).sum(1)
+        mean_non_score_reward = non_score_reward.mean()
+        self.stats["objective/non_score_reward"].append(
+            self.accelerator.gather_for_metrics(mean_non_score_reward).mean().item()
+        )
+        if self.reward_funcs is not None:
+            # Calculate RLHF reward by combining rewards with non_score_reward
+            rlhf_reward = rewards + non_score_reward
+            self.stats["objective/rlhf_reward"].append(
+                self.accelerator.gather_for_metrics(rlhf_reward).mean().item()
+            )
+
+        mean_entropy = -logprobs.sum(1).mean()
+        self.stats["objective/entropy"].append(
+            self.accelerator.gather_for_metrics(mean_entropy).mean().item()
+        )
+        chosen_rewards = self.beta * (chosen_logprobs_sum - chosen_ref_logprobs_sum)
+        gathered_chosen_rewards = self.accelerator.gather_for_metrics(chosen_rewards)
+        self.stats["rewards/chosen"].append(gathered_chosen_rewards.mean().item())
+        rejected_rewards = self.beta * (
+            rejected_logprobs_sum - rejected_ref_logprobs_sum
+        )
+        gathered_rejected_rewards = self.accelerator.gather_for_metrics(
+            rejected_rewards
+        )
+        self.stats["rewards/rejected"].append(gathered_rejected_rewards.mean().item())
+        margin = gathered_chosen_rewards - gathered_rejected_rewards
+        self.stats["rewards/margins"].append(margin.mean().item())
+        accuracy = margin > 0
+        self.stats["rewards/accuracies"].append(accuracy.float().mean().item())
+        self.stats["beta"].append(self.beta)
+
+        if (
+            self.args.torch_empty_cache_steps is not None
+            and self.state.global_step % self.args.torch_empty_cache_steps == 0
+        ):
+            empty_cache()
+
+        kwargs = {}
+
+        # For LOMO optimizers you need to explicitly use the learning rate
+        if self.args.optim in [OptimizerNames.LOMO, OptimizerNames.ADALOMO]:
+            kwargs["learning_rate"] = self._get_learning_rate()
+
+        if self.args.n_gpu > 1:
+            loss = loss.mean()  # mean() to average on multi-gpu parallel training
+
+        self.accelerator.backward(loss, **kwargs)
+
+        return loss.detach() / self.args.gradient_accumulation_steps
+
+    # Same as Trainer._maybe_log_save_evaluate but log our metrics
+    def _maybe_log_save_evaluate(
+        self,
+        tr_loss,
+        grad_norm,
+        model,
+        trial,
+        epoch,
+        ignore_keys_for_eval,
+        start_time,
+        learning_rate=None,
+    ):
+        if (
+            self.control.should_log
+            and self.state.global_step > self._globalstep_last_logged
+        ):
+            logs: dict[str, float] = {}
+
+            # all_gather + mean() to get average loss over all processes
+            tr_loss_scalar = self._nested_gather(tr_loss).mean().item()
+
+            # reset tr_loss to zero
+            tr_loss -= tr_loss
+
+            logs["loss"] = round(
+                tr_loss_scalar
+                / (self.state.global_step - self._globalstep_last_logged),
+                4,
+            )
+            if grad_norm is not None:
+                logs["grad_norm"] = (
+                    grad_norm.detach().item()
+                    if isinstance(grad_norm, torch.Tensor)
+                    else grad_norm
+                )
+            if learning_rate is not None:
+                logs["learning_rate"] = learning_rate
+            else:
+                logs["learning_rate"] = self._get_learning_rate()
+
+            # Add our metrics
+            for key, val in self.stats.items():
+                logs[key] = sum(val) / len(val)
+            self.stats = {key: [] for key in self.stats}  # reset stats
+
+            self._total_loss_scalar += tr_loss_scalar
+            self._globalstep_last_logged = self.state.global_step
+            self.store_flos()
+            self.log(logs, start_time)
+
+        metrics = None
+        if self.control.should_evaluate:
+            metrics = self._evaluate(trial, ignore_keys_for_eval)
+            is_new_best_metric = self._determine_best_metric(
+                metrics=metrics, trial=trial
+            )
+
+            if self.args.save_strategy == "best":
+                self.control.should_save = is_new_best_metric
+
+        if self.control.should_save:
+            self._save_checkpoint(model, trial)
+            self.control = self.callback_handler.on_save(
+                self.args, self.state, self.control
+            )
+
+    # Ensure the model card is saved along with the checkpoint
+    def _save_checkpoint(self, model, trial):
+        if self.args.hub_model_id is None:
+            model_name = Path(self.args.output_dir).name
+        else:
+            model_name = self.args.hub_model_id.split("/")[-1]
+        self.create_model_card(model_name=model_name)
+        super()._save_checkpoint(model, trial)
diff --git a/src/aixpert/training/training/trl/trainer/orpo_config.py b/src/aixpert/training/training/trl/trainer/orpo_config.py
new file mode 100644
index 0000000..a77b33a
--- /dev/null
+++ b/src/aixpert/training/training/trl/trainer/orpo_config.py
@@ -0,0 +1,176 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass, field
+from typing import Any
+
+from transformers import TrainingArguments
+
+
+@dataclass
+class ORPOConfig(TrainingArguments):
+    r"""
+    Configuration class for the [`ORPOTrainer`].
+
+    This class includes only the parameters that are specific to ORPO training. For a full list of training arguments,
+    please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this class may
+    differ from those in [`~transformers.TrainingArguments`].
+
+    Using [`~transformers.HfArgumentParser`] we can turn this class into
+    [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
+    command line.
+
+    Parameters
+    ----------
+        max_length (`int` or `None`, *optional*, defaults to `1024`):
+            Maximum length of the sequences (prompt + completion) in the batch. This argument is required if you want
+            to use the default data collator.
+        max_prompt_length (`int` or `None`, *optional*, defaults to `512`):
+            Maximum length of the prompt. This argument is required if you want to use the default data collator.
+        max_completion_length (`int`, *optional*):
+            Maximum length of the completion. This argument is required if you want to use the default data collator
+            and your model is an encoder-decoder.
+        beta (`float`, *optional*, defaults to `0.1`):
+            Parameter controlling the relative ratio loss weight in the ORPO loss. In the
+            [paper](https://huggingface.co/papers/2403.07691), it is denoted by λ. In the
+            [code](https://github.com/xfactlab/orpo), it is denoted by `alpha`.
+        disable_dropout (`bool`, *optional*, defaults to `True`):
+            Whether to disable dropout in the model.
+        label_pad_token_id (`int`, *optional*, defaults to `-100`):
+            Label pad token id. This argument is required if you want to use the default data collator.
+        padding_value (`int`, *optional*):
+            Padding value to use. If `None`, the padding value of the tokenizer is used.
+        truncation_mode (`str`, *optional*, defaults to `"keep_end"`):
+            Truncation mode to use when the prompt is too long. Possible values are `"keep_end"` or `"keep_start"`.
+            This argument is required if you want to use the default data collator.
+        generate_during_eval (`bool`, *optional*, defaults to `False`):
+            If `True`, generates and logs completions from the model to W&B or Comet during evaluation.
+        is_encoder_decoder (`bool`, *optional*):
+            When using the `model_init` argument (callable) to instantiate the model instead of the `model` argument,
+            you need to specify if the model returned by the callable is an encoder-decoder model.
+        model_init_kwargs (`dict[str, Any]`, *optional*):
+            Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a
+            string.
+        dataset_num_proc (`int`, *optional*):
+            Number of processes to use for processing the dataset.
+    """
+
+    _VALID_DICT_FIELDS = TrainingArguments._VALID_DICT_FIELDS + ["model_init_kwargs"]
+
+    # Parameters whose default values are overridden from TrainingArguments
+    learning_rate: float = field(
+        default=1e-6,
+        metadata={"help": "The initial learning rate for AdamW."},
+    )
+    logging_steps: float = field(
+        default=10,
+        metadata={
+            "help": "Log every X updates steps. Should be an integer or a float in range `[0,1)`. If smaller than 1, "
+            "will be interpreted as ratio of total training steps."
+        },
+    )
+    gradient_checkpointing: bool = field(
+        default=True,
+        metadata={
+            "help": "If True, use gradient checkpointing to save memory at the expense of slower backward pass."
+        },
+    )
+    bf16: bool | None = field(
+        default=None,
+        metadata={
+            "help": "Whether to use bf16 (mixed) precision instead of 32-bit. Requires Ampere or higher NVIDIA "
+            "architecture or Intel XPU or using CPU (use_cpu) or Ascend NPU. If not set, it defaults to `True` if "
+            "`fp16` is not set."
+        },
+    )
+
+    max_length: int | None = field(
+        default=1024,
+        metadata={
+            "help": "Maximum length of the sequences (prompt + completion) in the batch."
+        },
+    )
+    max_prompt_length: int | None = field(
+        default=512,
+        metadata={
+            "help": "Maximum length of the prompt. This argument is required if you want to use the default data "
+            "collator and your model is an encoder-decoder."
+        },
+    )
+    max_completion_length: int | None = field(
+        default=None,
+        metadata={
+            "help": "Maximum length of the completion. This argument is required if you want to use the default data "
+            "collator and your model is an encoder-decoder."
+        },
+    )
+    beta: float = field(
+        default=0.1,
+        metadata={
+            "help": "Parameter controlling the relative ratio loss weight in the ORPO loss. In the paper, it is "
+            "denoted by λ."
+        },
+    )
+    disable_dropout: bool = field(
+        default=True,
+        metadata={"help": "Whether to disable dropout in the model."},
+    )
+    label_pad_token_id: int = field(
+        default=-100,
+        metadata={
+            "help": "Label pad token id. This argument is required if you want to use the default data collator."
+        },
+    )
+    padding_value: int | None = field(
+        default=None,
+        metadata={
+            "help": "Padding value to use. If `None`, the padding value of the tokenizer is used."
+        },
+    )
+    truncation_mode: str = field(
+        default="keep_end",
+        metadata={
+            "help": "Truncation mode to use when the prompt is too long.",
+            "choices": ["keep_end", "keep_start"],
+        },
+    )
+    generate_during_eval: bool = field(
+        default=False,
+        metadata={
+            "help": "If `True`, generates and logs completions from the model to W&B during evaluation."
+        },
+    )
+    is_encoder_decoder: bool | None = field(
+        default=None,
+        metadata={
+            "help": "When using the `model_init` argument (callable) to instantiate the model instead of the `model` "
+            "argument, you need to specify if the model returned by the callable is an encoder-decoder model."
+        },
+    )
+    model_init_kwargs: dict[str, Any] | None = field(
+        default=None,
+        metadata={
+            "help": "Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model "
+            "from a string."
+        },
+    )
+    dataset_num_proc: int | None = field(
+        default=None,
+        metadata={"help": "Number of processes to use for processing the dataset."},
+    )
+
+    def __post_init__(self):
+        self.bf16 = not (self.fp16) if self.bf16 is None else self.bf16
+
+        super().__post_init__()
diff --git a/src/aixpert/training/training/trl/trainer/orpo_trainer.py b/src/aixpert/training/training/trl/trainer/orpo_trainer.py
new file mode 100644
index 0000000..c857f34
--- /dev/null
+++ b/src/aixpert/training/training/trl/trainer/orpo_trainer.py
@@ -0,0 +1,1244 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import os
+import random
+import textwrap
+import warnings
+from collections import defaultdict
+from collections.abc import Callable
+from contextlib import nullcontext
+from pathlib import Path
+from typing import Any, Literal
+
+import numpy as np
+import pandas as pd
+import torch
+import torch.nn.functional as F
+from accelerate import PartialState, logging
+from datasets import Dataset
+from torch import autocast, nn
+from torch.utils.data import DataLoader
+from transformers import (
+    AutoModelForCausalLM,
+    BaseImageProcessor,
+    DataCollator,
+    FeatureExtractionMixin,
+    PreTrainedModel,
+    PreTrainedTokenizerBase,
+    ProcessorMixin,
+    is_comet_available,
+    is_torch_xla_available,
+    is_wandb_available,
+)
+from transformers.trainer_callback import TrainerCallback
+from transformers.trainer_utils import EvalLoopOutput
+from transformers.utils import is_peft_available, is_torch_fx_proxy
+
+from ..data_utils import maybe_apply_chat_template, maybe_extract_prompt
+from .base_trainer import BaseTrainer
+from .orpo_config import ORPOConfig
+from .utils import (
+    DPODataCollatorWithPadding,
+    add_bos_token_if_needed,
+    add_eos_token_if_needed,
+    disable_dropout_in_model,
+    log_table_to_comet_experiment,
+    pad_to_length,
+    peft_module_casting_to_bf16,
+    selective_log_softmax,
+)
+
+
+if is_peft_available():
+    from peft import PeftModel, get_peft_model, prepare_model_for_kbit_training
+
+
+if is_wandb_available():
+    import wandb
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+
+logger = logging.get_logger(__name__)
+
+
+class ORPOTrainer(BaseTrainer):
+    r"""
+    Initialize ORPOTrainer.
+
+    Args:
+        model ([`~transformers.PreTrainedModel`]):
+            The model to train, preferably an [`~transformers.AutoModelForSequenceClassification`].
+        args ([`ORPOConfig`]):
+            The ORPO config arguments to use for training.
+        data_collator ([`~transformers.DataCollator`]):
+            The data collator to use for training. If None is specified, the default data collator
+            ([`DPODataCollatorWithPadding`]) will be used which will pad the sequences to the maximum length of the
+            sequences in the batch, given a dataset of paired sequences.
+        train_dataset ([`~datasets.Dataset`]):
+            The dataset to use for training.
+        eval_dataset ([`~datasets.Dataset`]):
+            The dataset to use for evaluation.
+        processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*):
+            Processing class used to process the data. If provided, will be used to automatically process the inputs
+            for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
+            reuse the fine-tuned model.
+        model_init (`Callable[[], transformers.PreTrainedModel]`):
+            The model initializer to use for training. If None is specified, the default model initializer will be
+            used.
+        callbacks (`list[transformers.TrainerCallback]`):
+            The callbacks to use for training.
+        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
+            The optimizer and scheduler to use for training.
+        preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
+            The function to use to preprocess the logits before computing the metrics.
+        peft_config (`dict`, defaults to `None`):
+            The PEFT configuration to use for training. If you pass a PEFT configuration, the model will be wrapped in
+            a PEFT model.
+        compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
+            The function to use to compute the metrics. Must take a `EvalPrediction` and return a dictionary string to
+            metric values.
+    """
+
+    _tag_names = ["trl", "orpo"]
+    _name = "ORPO"
+    _paper = {
+        "title": "ORPO: Monolithic Preference Optimization without Reference Model",
+        "id": "2403.07691",
+        # docstyle-ignore
+        "citation": textwrap.dedent("""\
+            @article{hong2024orpo,
+                title        = {{ORPO: Monolithic Preference Optimization without Reference Model}},
+                author       = {Jiwoo Hong and Noah Lee and James Thorne},
+                year         = 2024,
+                eprint       = {arXiv:2403.07691}
+            }"""),
+    }
+
+    def __init__(
+        self,
+        model: PreTrainedModel | nn.Module | str | None = None,
+        args: ORPOConfig | None = None,
+        data_collator: DataCollator | None = None,
+        train_dataset: Dataset | None = None,
+        eval_dataset: Dataset | dict[str, Dataset] | None = None,
+        processing_class: PreTrainedTokenizerBase
+        | BaseImageProcessor
+        | FeatureExtractionMixin
+        | ProcessorMixin
+        | None = None,
+        model_init: Callable[[], PreTrainedModel] | None = None,
+        callbacks: list[TrainerCallback] | None = None,
+        optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (
+            None,
+            None,
+        ),
+        preprocess_logits_for_metrics: Callable[
+            [torch.Tensor, torch.Tensor], torch.Tensor
+        ]
+        | None = None,
+        peft_config: dict | None = None,
+        compute_metrics: Callable[[EvalLoopOutput], dict] | None = None,
+    ):
+        if not os.environ.get("TRL_EXPERIMENTAL_SILENCE"):
+            warnings.warn(
+                "This trainer will soon be moved to trl.experimental and is a candidate for removal. If you rely on "
+                "it and want it to remain, please share your comments here: "
+                "https://github.com/huggingface/trl/issues/4223. Silence this warning by setting environment variable "
+                "TRL_EXPERIMENTAL_SILENCE=1."
+            )
+        if args.model_init_kwargs is None:
+            model_init_kwargs = {}
+        elif not isinstance(model, str):
+            raise ValueError(
+                "You passed model_kwargs to the ORPOTrainer. But your model is already instantiated."
+            )
+        else:
+            model_init_kwargs = args.model_init_kwargs
+            dtype = model_init_kwargs.get("dtype")
+            if dtype is not None:
+                # Convert to `torch.dtype` if an str is passed
+                if isinstance(dtype, str) and dtype != "auto":
+                    dtype = getattr(torch, dtype)
+                if dtype != "auto" and not isinstance(dtype, torch.dtype):
+                    raise ValueError(
+                        f"Invalid `dtype` passed to the ORPOConfig. Expected a string with either `torch.dtype` or 'auto', but got {dtype}."
+                    )
+                model_init_kwargs["dtype"] = dtype
+
+        if isinstance(model, str):
+            model = AutoModelForCausalLM.from_pretrained(model, **model_init_kwargs)
+
+        # Initialize this variable to False. This helps tracking the case when `peft_module_casting_to_bf16`
+        # has been called in order to properly call autocast if needed.
+        self._peft_has_been_casted_to_bf16 = False
+
+        if not is_peft_available() and peft_config is not None:
+            raise ValueError(
+                "PEFT is not installed and you passed a `peft_config` in the trainer's kwargs, please install it to use the PEFT models"
+            )
+        if is_peft_available() and peft_config is not None:
+            # if model is a peft model and we have a peft_config, we merge and unload it first
+            if isinstance(model, PeftModel):
+                model = model.merge_and_unload()
+
+            if getattr(model, "is_loaded_in_8bit", False) or getattr(
+                model, "is_loaded_in_4bit", False
+            ):
+                _support_gc_kwargs = hasattr(
+                    args, "gradient_checkpointing_kwargs"
+                ) and "gradient_checkpointing_kwargs" in list(
+                    inspect.signature(prepare_model_for_kbit_training).parameters
+                )
+
+                prepare_model_kwargs = {
+                    "use_gradient_checkpointing": args.gradient_checkpointing
+                }
+
+                if _support_gc_kwargs:
+                    prepare_model_kwargs["gradient_checkpointing_kwargs"] = (
+                        args.gradient_checkpointing_kwargs
+                    )
+
+                model = prepare_model_for_kbit_training(model, **prepare_model_kwargs)
+            elif args.gradient_checkpointing:
+                # For backward compatibility with older versions of transformers
+                if hasattr(model, "enable_input_require_grads"):
+                    model.enable_input_require_grads()
+                else:
+
+                    def make_inputs_require_grad(module, input, output):
+                        output.requires_grad_(True)
+
+                    model.get_input_embeddings().register_forward_hook(
+                        make_inputs_require_grad
+                    )
+
+            # get peft model with the given config
+            model = get_peft_model(model, peft_config)
+            if args.bf16 and getattr(model, "is_loaded_in_4bit", False):
+                peft_module_casting_to_bf16(model)
+                # If args.bf16 we need to explicitly call `generate` with torch amp autocast context manager
+                self._peft_has_been_casted_to_bf16 = True
+
+        # For models that use gradient_checkpointing, we need to attach a hook that enables input
+        # to explicitly have `requires_grad=True`, otherwise training will either silently
+        # fail or completely fail.
+        elif args.gradient_checkpointing:
+            # For backward compatibility with older versions of transformers
+            if hasattr(model, "enable_input_require_grads"):
+                model.enable_input_require_grads()
+            else:
+
+                def make_inputs_require_grad(module, input, output):
+                    output.requires_grad_(True)
+
+                model.get_input_embeddings().register_forward_hook(
+                    make_inputs_require_grad
+                )
+
+        if args.generate_during_eval and not (
+            is_wandb_available() or is_comet_available()
+        ):
+            raise ValueError(
+                "`generate_during_eval=True` requires Weights and Biases or Comet to be installed."
+                " Please install `wandb` or `comet-ml` to resolve."
+            )
+
+        if model is not None:
+            self.is_encoder_decoder = model.config.is_encoder_decoder
+        elif args.is_encoder_decoder is None:
+            raise ValueError(
+                "When no model is provided, you need to pass the parameter is_encoder_decoder."
+            )
+        else:
+            self.is_encoder_decoder = args.is_encoder_decoder
+
+        if self.is_encoder_decoder:
+            self.decoder_start_token_id = model.config.decoder_start_token_id
+            self.pad_token_id = model.config.pad_token_id
+
+        if processing_class is None:
+            raise ValueError(
+                "processing_class must be specified to tokenize a ORPO dataset."
+            )
+        if args.max_length is None:
+            logger.warning(
+                "`max_length` is not set in the ORPOConfig's init"
+                " it will default to `512` by default, but you should do it yourself in the future.",
+            )
+            max_length = 512
+        else:
+            max_length = args.max_length
+        if args.max_prompt_length is None:
+            logger.warning(
+                "`max_prompt_length` is not set in the ORPOConfig's init"
+                " it will default to `128` by default, but you should do it yourself in the future.",
+            )
+            max_prompt_length = 128
+        else:
+            max_prompt_length = args.max_prompt_length
+
+        if args.max_completion_length is None and self.is_encoder_decoder:
+            logger.warning(
+                "When using an encoder decoder architecture, you should set `max_completion_length` in the ORPOConfig's init"
+                " it will default to `128` by default, but you should do it yourself in the future.",
+            )
+            self.max_completion_length = 128
+        else:
+            self.max_completion_length = args.max_completion_length
+
+        if data_collator is None:
+            data_collator = DPODataCollatorWithPadding(
+                pad_token_id=processing_class.pad_token_id,
+                label_pad_token_id=args.label_pad_token_id,
+                is_encoder_decoder=self.is_encoder_decoder,
+            )
+
+            if args.remove_unused_columns:
+                args.remove_unused_columns = False
+                # warn users
+                logger.warning(
+                    "When using DPODataCollatorWithPadding, you should set `remove_unused_columns=False` in your TrainingArguments"
+                    " we have set it for you, but you should do it yourself in the future.",
+                )
+
+            self.use_dpo_data_collator = True
+        else:
+            self.use_dpo_data_collator = False
+
+        # Disable dropout in the model and reference model
+        if args.disable_dropout:
+            disable_dropout_in_model(model)
+
+        self.max_length = max_length
+        self.generate_during_eval = args.generate_during_eval
+        self.label_pad_token_id = args.label_pad_token_id
+        self.padding_value = (
+            args.padding_value
+            if args.padding_value is not None
+            else processing_class.pad_token_id
+        )
+        self.max_prompt_length = max_prompt_length
+        self.truncation_mode = args.truncation_mode
+        self.processing_class = processing_class
+
+        self.beta = args.beta
+        self.aux_loss_enabled = getattr(model.config, "output_router_logits", False)
+        self.aux_loss_coef = getattr(model.config, "router_aux_loss_coef", 0.0)
+        if self.aux_loss_enabled and self.aux_loss_coef == 0.0:
+            logger.warning(
+                "You set `output_router_logits` to `True` in the model config, but `router_aux_loss_coef` is set to "
+                "`0.0`, meaning the auxiliary loss will not be used. Either set `router_aux_loss_coef` to a value "
+                "greater than `0.0`, or set `output_router_logits` to `False` if you don't want to use the auxiliary "
+                "loss.",
+            )
+
+        self._stored_metrics = defaultdict(lambda: defaultdict(list))
+
+        # The trainer estimates the number of FLOPs (floating-point operations) using the number of elements in the
+        # input tensor associated with the key "input_ids". However, in ORPO, the sampled data does not include the
+        # "input_ids" key. Instead, the available keys are "prompt_input_ids", "chosen_input_ids", and
+        # "rejected_input_ids". As a result, the trainer issues the warning: "Could not estimate the number of tokens
+        # of the input, floating-point operations will not be computed." To suppress this warning, we set the
+        # "estimate_tokens" key in the model's "warnings_issued" dictionary to True. This acts as a flag to indicate
+        # that the warning has already been issued.
+        model.warnings_issued["estimate_tokens"] = True
+
+        # Compute that only on the main process for faster data processing.
+        # see: https://github.com/huggingface/trl/pull/1255
+        with PartialState().main_process_first():
+            # Extract the prompt if needed, and apply the chat template if needed
+            train_dataset = train_dataset.map(
+                maybe_extract_prompt, num_proc=args.dataset_num_proc
+            )
+            train_dataset = train_dataset.map(
+                maybe_apply_chat_template,
+                fn_kwargs={"tokenizer": processing_class},
+                num_proc=args.dataset_num_proc,
+            )
+            train_dataset = train_dataset.map(
+                self.tokenize_row, num_proc=args.dataset_num_proc
+            )
+            if eval_dataset is not None:
+                eval_dataset = eval_dataset.map(
+                    maybe_extract_prompt, num_proc=args.dataset_num_proc
+                )
+                eval_dataset = eval_dataset.map(
+                    maybe_apply_chat_template,
+                    fn_kwargs={"tokenizer": processing_class},
+                    num_proc=args.dataset_num_proc,
+                )
+                eval_dataset = eval_dataset.map(
+                    self.tokenize_row, num_proc=args.dataset_num_proc
+                )
+
+        super().__init__(
+            model=model,
+            args=args,
+            data_collator=data_collator,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            processing_class=processing_class,
+            model_init=model_init,
+            compute_metrics=compute_metrics,
+            callbacks=callbacks,
+            optimizers=optimizers,
+            preprocess_logits_for_metrics=preprocess_logits_for_metrics,
+        )
+
+        # Gradient accumulation requires scaled loss. Normally, loss scaling in the parent class depends on whether the
+        # model accepts loss-related kwargs. Since we compute our own loss, this check is irrelevant. We set
+        # self.model_accepts_loss_kwargs to False to enable scaling.
+        self.model_accepts_loss_kwargs = False
+
+        # Add tags for models that have been loaded with the correct transformers version
+        if hasattr(self.model, "add_model_tags"):
+            self.model.add_model_tags(self._tag_names)
+
+        if not hasattr(self, "accelerator"):
+            raise AttributeError(
+                "Your `Trainer` does not have an `accelerator` object. Consider upgrading `transformers`."
+            )
+
+    def build_tokenized_answer(self, prompt, answer):
+        """
+        Llama tokenizer does satisfy `enc(a + b) = enc(a) + enc(b)`. It does ensure `enc(a + b) = enc(a) + enc(a +
+        b)[len(enc(a)):]`. Reference:
+            https://github.com/EleutherAI/lm-evaluation-harness/pull/531#issuecomment-1595586257
+        """
+        full_tokenized = self.processing_class(
+            prompt + answer, add_special_tokens=False
+        )
+        prompt_input_ids = self.processing_class(prompt, add_special_tokens=False)[
+            "input_ids"
+        ]
+
+        answer_input_ids = full_tokenized["input_ids"][len(prompt_input_ids) :]
+        answer_attention_mask = full_tokenized["attention_mask"][
+            len(prompt_input_ids) :
+        ]
+
+        # Concat tokens to form `enc(a) + enc(a + b)[len(enc(a)):]`
+        full_concat_input_ids = np.concatenate([prompt_input_ids, answer_input_ids])
+
+        # Prepare input tokens for token by token comparison
+        full_input_ids = np.array(full_tokenized["input_ids"])
+
+        if len(full_input_ids) != len(full_concat_input_ids):
+            raise ValueError(
+                "Prompt input ids and answer input ids should have the same length."
+            )
+
+        # On some tokenizers, like Llama-2 tokenizer, there are occasions where tokens
+        # can be merged together when tokenizing prompt+answer. This could result
+        # on the last token from the prompt being different when tokenized on its own
+        # vs when done as prompt+answer.
+        response_token_ids_start_idx = len(prompt_input_ids)
+
+        # If tokenized prompt is different than both prompt+answer, then it means the
+        # last token has changed due to merging.
+        if (
+            prompt_input_ids
+            != full_tokenized["input_ids"][:response_token_ids_start_idx]
+        ):
+            response_token_ids_start_idx -= 1
+
+        prompt_input_ids = full_tokenized["input_ids"][:response_token_ids_start_idx]
+        prompt_attention_mask = full_tokenized["attention_mask"][
+            :response_token_ids_start_idx
+        ]
+
+        if len(prompt_input_ids) != len(prompt_attention_mask):
+            raise ValueError(
+                "Prompt input ids and attention mask should have the same length."
+            )
+
+        answer_input_ids = full_tokenized["input_ids"][response_token_ids_start_idx:]
+        answer_attention_mask = full_tokenized["attention_mask"][
+            response_token_ids_start_idx:
+        ]
+
+        return dict(
+            prompt_input_ids=prompt_input_ids,
+            prompt_attention_mask=prompt_attention_mask,
+            input_ids=answer_input_ids,
+            attention_mask=answer_attention_mask,
+        )
+
+    def tokenize_row(
+        self, feature, model: PreTrainedModel | nn.Module | None = None
+    ) -> dict:
+        """Tokenize a single row from a ORPO specific dataset.
+
+        At this stage, we don't convert to PyTorch tensors yet; we just handle the truncation in case the prompt +
+        chosen or prompt + rejected responses is/are too long. First we truncate the prompt; if we're still too long,
+        we truncate the chosen/rejected.
+
+        We also create the labels for the chosen/rejected responses, which are of length equal to the sum of the length
+        of the prompt and the chosen/rejected response, with label_pad_token_id for the prompt tokens.
+        """
+        batch = {}
+        prompt = feature["prompt"]
+        chosen = feature["chosen"]
+        rejected = feature["rejected"]
+
+        if not self.is_encoder_decoder:
+            # Check issues below for more details
+            #  1. https://github.com/huggingface/trl/issues/907
+            #  2. https://github.com/EleutherAI/lm-evaluation-harness/pull/531#issuecomment-1595586257
+            #  3. https://github.com/LianjiaTech/BELLE/issues/337
+
+            if not isinstance(prompt, str):
+                raise ValueError(f"prompt should be an str but got {type(prompt)}")
+            prompt_tokens = self.processing_class(prompt, add_special_tokens=False)
+            prompt_tokens = {f"prompt_{k}": v for k, v in prompt_tokens.items()}
+
+            if not isinstance(chosen, str):
+                raise ValueError(f"chosen should be an str but got {type(chosen)}")
+            chosen_tokens = self.build_tokenized_answer(prompt, chosen)
+
+            if not isinstance(rejected, str):
+                raise ValueError(f"rejected should be an str but got {type(rejected)}")
+            rejected_tokens = self.build_tokenized_answer(prompt, rejected)
+
+            # Last prompt token might get merged by tokenizer and
+            # it should not be included for generation if that happens
+            prompt_len_input_ids = len(prompt_tokens["prompt_input_ids"])
+
+            chosen_prompt_len_input_ids = len(chosen_tokens["prompt_input_ids"])
+            rejected_prompt_len_input_ids = len(rejected_tokens["prompt_input_ids"])
+            prompt_len_input_ids = min(
+                chosen_prompt_len_input_ids, rejected_prompt_len_input_ids
+            )
+
+            for k, v in prompt_tokens.items():
+                prompt_tokens[k] = v[:prompt_len_input_ids]
+
+            # Make sure prompts only have one different token at most an
+            # and length only differs by 1 at most
+            num_diff_tokens = sum(
+                a != b
+                for a, b in zip(
+                    chosen_tokens["prompt_input_ids"],
+                    rejected_tokens["prompt_input_ids"],
+                    strict=True,
+                )
+            )
+            num_diff_len = abs(
+                chosen_prompt_len_input_ids - rejected_prompt_len_input_ids
+            )
+            if num_diff_tokens > 1 or num_diff_len > 1:
+                raise ValueError(
+                    "Chosen and rejected prompt_input_ids might only differ on the "
+                    "last token due to tokenizer merge ops."
+                )
+
+            # add BOS token to head of prompt. Avoid adding if it's already there
+            prompt_tokens, chosen_tokens, rejected_tokens = add_bos_token_if_needed(
+                self.processing_class.bos_token_id,
+                prompt_len_input_ids,
+                prompt_tokens,
+                chosen_prompt_len_input_ids,
+                chosen_tokens,
+                rejected_prompt_len_input_ids,
+                rejected_tokens,
+            )
+
+            # add EOS token to end of answer. Avoid adding if it's already there
+            chosen_tokens, rejected_tokens = add_eos_token_if_needed(
+                self.processing_class.eos_token_id, chosen_tokens, rejected_tokens
+            )
+
+            longer_response_length = max(
+                len(chosen_tokens["input_ids"]), len(rejected_tokens["input_ids"])
+            )
+
+            # if combined sequence is too long, truncate the prompt
+            for answer_tokens in [chosen_tokens, rejected_tokens, prompt_tokens]:
+                if (
+                    len(answer_tokens["prompt_input_ids"]) + longer_response_length
+                    > self.max_length
+                ):
+                    if self.truncation_mode == "keep_start":
+                        for k in ["prompt_input_ids", "prompt_attention_mask"]:
+                            answer_tokens[k] = answer_tokens[k][
+                                : self.max_prompt_length
+                            ]
+                    elif self.truncation_mode == "keep_end":
+                        for k in ["prompt_input_ids", "prompt_attention_mask"]:
+                            answer_tokens[k] = answer_tokens[k][
+                                -self.max_prompt_length :
+                            ]
+                    else:
+                        raise ValueError(
+                            f"Unknown truncation mode: {self.truncation_mode}"
+                        )
+
+            # if that's still too long, truncate the response
+            for answer_tokens in [chosen_tokens, rejected_tokens]:
+                if (
+                    len(answer_tokens["prompt_input_ids"]) + longer_response_length
+                    > self.max_length
+                ):
+                    for k in ["input_ids", "attention_mask"]:
+                        answer_tokens[k] = answer_tokens[k][
+                            : self.max_length - self.max_prompt_length
+                        ]
+
+            # Create labels
+            chosen_sequence_tokens = {
+                k: chosen_tokens[f"prompt_{k}"] + chosen_tokens[k]
+                for k in ["input_ids", "attention_mask"]
+            }
+            rejected_sequence_tokens = {
+                k: rejected_tokens[f"prompt_{k}"] + rejected_tokens[k]
+                for k in ["input_ids", "attention_mask"]
+            }
+            chosen_sequence_tokens["labels"] = chosen_sequence_tokens["input_ids"][:]
+            chosen_sequence_tokens["labels"][
+                : len(chosen_tokens["prompt_input_ids"])
+            ] = [self.label_pad_token_id] * len(chosen_tokens["prompt_input_ids"])
+            rejected_sequence_tokens["labels"] = rejected_sequence_tokens["input_ids"][
+                :
+            ]
+            rejected_sequence_tokens["labels"][
+                : len(rejected_tokens["prompt_input_ids"])
+            ] = [self.label_pad_token_id] * len(rejected_tokens["prompt_input_ids"])
+
+            for k, toks in {
+                "chosen_": chosen_sequence_tokens,
+                "rejected_": rejected_sequence_tokens,
+                "": prompt_tokens,
+            }.items():
+                for type_key, tokens in toks.items():
+                    if type_key == "token_type_ids":
+                        continue
+                    batch[f"{k}{type_key}"] = tokens
+
+        else:
+            chosen_tokens = self.processing_class(
+                chosen,
+                truncation=True,
+                max_length=self.max_completion_length,
+                add_special_tokens=True,
+            )
+            rejected_tokens = self.processing_class(
+                rejected,
+                truncation=True,
+                max_length=self.max_completion_length,
+                add_special_tokens=True,
+            )
+            prompt_tokens = self.processing_class(
+                prompt,
+                truncation=True,
+                max_length=self.max_prompt_length,
+                add_special_tokens=True,
+            )
+
+            batch["chosen_labels"] = chosen_tokens["input_ids"]
+            batch["rejected_labels"] = rejected_tokens["input_ids"]
+            batch["prompt_input_ids"] = prompt_tokens["input_ids"]
+            batch["prompt_attention_mask"] = prompt_tokens["attention_mask"]
+
+            if model is not None and hasattr(
+                model, "prepare_decoder_input_ids_from_labels"
+            ):
+                batch["rejected_decoder_input_ids"] = (
+                    model.prepare_decoder_input_ids_from_labels(
+                        labels=torch.tensor(batch["rejected_labels"])
+                    )
+                )
+                batch["chosen_decoder_input_ids"] = (
+                    model.prepare_decoder_input_ids_from_labels(
+                        labels=torch.tensor(batch["chosen_labels"])
+                    )
+                )
+
+        if is_torch_xla_available():
+            # Pad the sequences to global max_length to avoid TorchXLA recompilation
+            for k in batch:
+                if "labels" in k or self.is_encoder_decoder:
+                    pad_value = self.label_pad_token_id
+                elif k.endswith("_input_ids"):
+                    pad_value = self.padding_value
+                elif k.endswith("_attention_mask"):
+                    pad_value = 0
+                batch[k] = batch[k] + [pad_value] * (self.max_length - len(batch[k]))
+        return batch
+
+    @staticmethod
+    def concatenated_inputs(
+        batch: dict[str, list | torch.LongTensor],
+        is_encoder_decoder: bool = False,
+        label_pad_token_id: int = -100,
+        padding_value: int = 0,
+        device: torch.device | None = None,
+    ) -> dict[str, torch.LongTensor]:
+        """Concatenate the chosen and rejected inputs into a single tensor.
+
+        Args:
+            batch:
+                A batch of data. Must contain the keys 'chosen_input_ids' and 'rejected_input_ids', which are tensors
+                of shape (batch_size, sequence_length).
+            is_encoder_decoder:
+                Whether the model is an encoder-decoder model.
+            label_pad_token_id:
+                The label pad token id.
+            padding_value:
+                The padding value to use for the concatenated inputs_ids.
+            device:
+                The device for the concatenated inputs.
+
+        Returns
+        -------
+            A dictionary containing the concatenated inputs under the key 'concatenated_input_ids'.
+        """
+        concatenated_batch = {}
+
+        if is_encoder_decoder:
+            max_length = max(
+                batch["chosen_labels"].shape[1], batch["rejected_labels"].shape[1]
+            )
+        else:
+            max_length = max(
+                batch["chosen_input_ids"].shape[1], batch["rejected_input_ids"].shape[1]
+            )
+
+        for k in batch:
+            if k.startswith("chosen") and isinstance(batch[k], torch.Tensor):
+                if "labels" in k or is_encoder_decoder:
+                    pad_value = label_pad_token_id
+                elif k.endswith("_input_ids"):
+                    pad_value = padding_value
+                elif k.endswith("_attention_mask"):
+                    pad_value = 0
+                concatenated_key = k.replace("chosen", "concatenated")
+                concatenated_batch[concatenated_key] = pad_to_length(
+                    batch[k], max_length, pad_value=pad_value
+                )
+        for k in batch:
+            if k.startswith("rejected") and isinstance(batch[k], torch.Tensor):
+                if "labels" in k or is_encoder_decoder:
+                    pad_value = label_pad_token_id
+                elif k.endswith("_input_ids"):
+                    pad_value = padding_value
+                elif k.endswith("_attention_mask"):
+                    pad_value = 0
+                concatenated_key = k.replace("rejected", "concatenated")
+                concatenated_batch[concatenated_key] = torch.cat(
+                    (
+                        concatenated_batch[concatenated_key],
+                        pad_to_length(batch[k], max_length, pad_value=pad_value),
+                    ),
+                    dim=0,
+                ).to(device=device)
+
+        if is_encoder_decoder:
+            concatenated_batch["concatenated_input_ids"] = (
+                batch["prompt_input_ids"].repeat(2, 1).to(device=device)
+            )
+            concatenated_batch["concatenated_attention_mask"] = (
+                batch["prompt_attention_mask"].repeat(2, 1).to(device=device)
+            )
+
+        return concatenated_batch
+
+    def odds_ratio_loss(
+        self,
+        policy_chosen_logps: torch.FloatTensor,
+        policy_rejected_logps: torch.FloatTensor,
+    ) -> tuple[
+        torch.FloatTensor,
+        torch.FloatTensor,
+        torch.FloatTensor,
+        torch.FloatTensor,
+        torch.FloatTensor,
+    ]:
+        """Compute ORPO's odds ratio (OR) loss for a batch of policy and reference model log probabilities.
+
+        Args:
+            policy_chosen_logps:
+                Log probabilities of the policy model for the chosen responses. Shape: (batch_size,)
+            policy_rejected_logps:
+                Log probabilities of the policy model for the rejected responses. Shape: (batch_size,)
+
+        Returns
+        -------
+            A tuple of three tensors: (losses, chosen_rewards, rejected_rewards). The losses tensor contains the ORPO
+            loss for each example in the batch. The chosen_rewards and rejected_rewards tensors contain the rewards for
+            the chosen and rejected responses, respectively. The log odds ratio of the chosen responses over the
+            rejected responses ratio for logging purposes. The `log(sigmoid(log_odds_chosen))` for logging purposes.
+        """
+        # Derived from Eqs. (4) and (7) from https://huggingface.co/papers/2403.07691 by using log identities and exp(log(P(y|x)) = P(y|x)
+        log_odds = (policy_chosen_logps - policy_rejected_logps) - (
+            torch.log1p(-torch.exp(policy_chosen_logps))
+            - torch.log1p(-torch.exp(policy_rejected_logps))
+        )
+        ratio = F.logsigmoid(log_odds)
+        losses = self.beta * ratio
+
+        chosen_rewards = (
+            self.beta * (policy_chosen_logps.to(self.accelerator.device)).detach()
+        )
+        rejected_rewards = (
+            self.beta * (policy_rejected_logps.to(self.accelerator.device)).detach()
+        )
+
+        return (
+            losses,
+            chosen_rewards,
+            rejected_rewards,
+            torch.mean(ratio),
+            torch.mean(log_odds),
+        )
+
+    @staticmethod
+    def get_batch_logps(
+        logits: torch.FloatTensor,
+        labels: torch.LongTensor,
+        average_log_prob: bool = False,
+        label_pad_token_id: int = -100,
+        is_encoder_decoder: bool = False,
+    ) -> torch.FloatTensor:
+        """Compute the log probabilities of the given labels under the given logits.
+
+        Args:
+            logits: Logits of the model (unnormalized). Shape: (batch_size, sequence_length, vocab_size)
+            labels:
+                Labels for which to compute the log probabilities. Label tokens with a value of label_pad_token_id are
+                ignored. Shape: (batch_size, sequence_length)
+            average_log_prob:
+                If True, return the average log probability per (non-masked) token. Otherwise, return the sum of the
+                log probabilities of the (non-masked) tokens.
+            label_pad_token_id: The label pad token id.
+            is_encoder_decoder: Whether the model is an encoder-decoder model.
+
+        Returns
+        -------
+            A tensor of shape (batch_size,) containing the average/sum log probabilities of the given labels under the
+            given logits.
+        """
+        if logits.shape[:-1] != labels.shape:
+            raise ValueError(
+                "Logits (batch and sequence length dim) and labels must have the same shape."
+            )
+
+        if not is_encoder_decoder:
+            labels = labels[:, 1:].clone()
+            logits = logits[:, :-1, :]
+        loss_mask = labels != label_pad_token_id
+
+        # dummy token; we'll ignore the losses on these tokens later
+        labels = torch.where(labels == label_pad_token_id, 0, labels)
+
+        per_token_logps = selective_log_softmax(logits, labels)
+
+        if average_log_prob:
+            return (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1)
+        return (per_token_logps * loss_mask).sum(-1)
+
+    def concatenated_forward(
+        self, model: nn.Module, batch: dict[str, list | torch.LongTensor]
+    ) -> tuple[
+        torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor
+    ]:
+        """Run the given model on the given batch of inputs, concatenating the chosen and rejected inputs together.
+
+        We do this to avoid doing two forward passes, because it's faster for FSDP.
+        """
+        concatenated_batch = self.concatenated_inputs(
+            batch,
+            is_encoder_decoder=self.is_encoder_decoder,
+            label_pad_token_id=self.label_pad_token_id,
+            padding_value=self.padding_value,
+            device=self.accelerator.device,
+        )
+        len_chosen = batch["chosen_labels"].shape[0]
+
+        model_kwargs = (
+            {
+                "decoder_input_ids": self._shift_right(
+                    concatenated_batch["concatenated_labels"]
+                ),
+            }
+            if self.is_encoder_decoder
+            else {}
+        )
+
+        if self.aux_loss_enabled:
+            model_kwargs["output_router_logits"] = True
+
+        outputs = model(
+            concatenated_batch["concatenated_input_ids"],
+            attention_mask=concatenated_batch["concatenated_attention_mask"],
+            use_cache=False,
+            **model_kwargs,
+        )
+        all_logits = outputs.logits
+
+        def cross_entropy_loss(logits, labels):
+            if not self.is_encoder_decoder:
+                # Shift so that tokens < n predict n
+                logits = logits[..., :-1, :].contiguous()
+                labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = nn.CrossEntropyLoss()
+            logits = logits.view(-1, logits.shape[-1])
+            labels = labels.view(-1)
+            # Enable model parallelism
+            labels = labels.to(logits.device)
+            loss = loss_fct(logits, labels)
+            return loss
+
+        if self.is_encoder_decoder:
+            labels = concatenated_batch["concatenated_labels"].clone()
+        else:
+            labels = concatenated_batch["concatenated_input_ids"].clone()
+            attention_mask = concatenated_batch["concatenated_attention_mask"]
+            labels = torch.where(attention_mask == 1, labels, self.label_pad_token_id)
+        # orpo chosen nll loss is computed over the full prompt and response
+        chosen_nll_loss = cross_entropy_loss(
+            all_logits[:len_chosen], labels[:len_chosen]
+        )
+
+        all_logps = self.get_batch_logps(
+            all_logits,
+            concatenated_batch["concatenated_labels"],
+            average_log_prob=True,
+            is_encoder_decoder=self.is_encoder_decoder,
+            label_pad_token_id=self.label_pad_token_id,
+        )
+
+        chosen_logps = all_logps[:len_chosen]
+        rejected_logps = all_logps[len_chosen:]
+
+        if not self.is_encoder_decoder:
+            chosen_logits = all_logits[:len_chosen, :-1, :]
+            rejected_logits = all_logits[len_chosen:, :-1, :]
+        else:
+            chosen_logits = all_logits[:len_chosen]
+            rejected_logits = all_logits[len_chosen:]
+
+        if self.aux_loss_enabled:
+            return (
+                chosen_logps,
+                rejected_logps,
+                chosen_logits,
+                rejected_logits,
+                chosen_nll_loss,
+                outputs.aux_loss,
+            )
+
+        return (
+            chosen_logps,
+            rejected_logps,
+            chosen_logits,
+            rejected_logits,
+            chosen_nll_loss,
+        )
+
+    def get_batch_loss_metrics(
+        self,
+        model,
+        batch: dict[str, list | torch.LongTensor],
+        train_eval: Literal["train", "eval"] = "train",
+    ):
+        """Compute the ORPO loss and other metrics for the given batch of inputs for train or test."""
+        metrics = {}
+
+        forward_output = self.concatenated_forward(model, batch)
+        (
+            policy_chosen_logps,
+            policy_rejected_logps,
+            policy_chosen_logits,
+            policy_rejected_logits,
+            policy_nll_loss,
+        ) = forward_output[:5]
+        if self.aux_loss_enabled:
+            aux_loss = forward_output[5]
+
+        losses, chosen_rewards, rejected_rewards, log_odds_ratio, log_odds_chosen = (
+            self.odds_ratio_loss(policy_chosen_logps, policy_rejected_logps)
+        )
+        # full ORPO loss
+        loss = policy_nll_loss - losses.mean()
+
+        reward_accuracies = (chosen_rewards > rejected_rewards).float()
+
+        prefix = "eval_" if train_eval == "eval" else ""
+        metrics[f"{prefix}rewards/chosen"] = self.accelerator.gather_for_metrics(
+            chosen_rewards
+        ).mean()
+        metrics[f"{prefix}rewards/rejected"] = self.accelerator.gather_for_metrics(
+            rejected_rewards
+        ).mean()
+        metrics[f"{prefix}rewards/accuracies"] = self.accelerator.gather_for_metrics(
+            reward_accuracies
+        ).mean()
+        metrics[f"{prefix}rewards/margins"] = self.accelerator.gather_for_metrics(
+            chosen_rewards - rejected_rewards
+        ).mean()
+        metrics[f"{prefix}logps/rejected"] = (
+            self.accelerator.gather_for_metrics(policy_rejected_logps).detach().mean()
+        )
+        metrics[f"{prefix}logps/chosen"] = (
+            self.accelerator.gather_for_metrics(policy_chosen_logps).detach().mean()
+        )
+        metrics[f"{prefix}logits/rejected"] = self.accelerator.gather_for_metrics(
+            policy_rejected_logits.detach().mean()
+        ).mean()
+        metrics[f"{prefix}logits/chosen"] = self.accelerator.gather_for_metrics(
+            policy_chosen_logits.detach().mean()
+        ).mean()
+        metrics[f"{prefix}nll_loss"] = (
+            self.accelerator.gather_for_metrics(policy_nll_loss).detach().mean()
+        )
+        metrics[f"{prefix}log_odds_ratio"] = (
+            self.accelerator.gather_for_metrics(log_odds_ratio).detach().mean()
+        )
+        metrics[f"{prefix}log_odds_chosen"] = (
+            self.accelerator.gather_for_metrics(log_odds_chosen).detach().mean()
+        )
+        if is_torch_xla_available():
+            xm.mark_step()  # needed because .item() calls
+        for k, v in metrics.items():
+            metrics[k] = v.item()
+        if self.aux_loss_enabled:
+            loss += self.aux_loss_coef * aux_loss
+
+        return loss, metrics
+
+    def compute_loss(
+        self,
+        model: PreTrainedModel | nn.Module,
+        inputs: dict[str, torch.Tensor | Any],
+        return_outputs=False,
+        num_items_in_batch=None,
+    ) -> torch.Tensor | tuple[torch.Tensor, dict[str, torch.Tensor]]:
+        compute_loss_context_manager = (
+            autocast(self.accelerator.device.type)
+            if self._peft_has_been_casted_to_bf16
+            else nullcontext()
+        )
+
+        with compute_loss_context_manager:
+            loss, metrics = self.get_batch_loss_metrics(
+                model, inputs, train_eval="train"
+            )
+
+        # Make sure to move the loss to the device the original accumulating loss is at back in the `Trainer` class:
+        loss = loss.to(self.args.device)
+
+        # force log the metrics
+        self.store_metrics(metrics, train_eval="train")
+
+        if return_outputs:
+            return (loss, metrics)
+        return loss
+
+    def generate_from_model(self, model, batch: dict[str, torch.LongTensor]) -> str:
+        """Generate samples from the model and reference model for the given batch of inputs."""
+        # If one uses `generate_during_eval` with peft + bf16, we need to explicitly call generate with
+        # the torch amp context manager as some hidden states are silently casted to full precision.
+        generate_context_manager = (
+            autocast(self.accelerator.device.type)
+            if self._peft_has_been_casted_to_bf16
+            else nullcontext()
+        )
+
+        with generate_context_manager:
+            policy_output = model.generate(
+                input_ids=batch["prompt_input_ids"],
+                attention_mask=batch["prompt_attention_mask"],
+                max_length=self.max_length,
+                do_sample=True,
+                pad_token_id=self.processing_class.pad_token_id,
+            )
+
+        policy_output = pad_to_length(
+            policy_output, self.max_length, self.processing_class.pad_token_id
+        )
+        policy_output_decoded = self.processing_class.batch_decode(
+            policy_output, skip_special_tokens=True
+        )
+
+        return policy_output_decoded
+
+    def prediction_step(
+        self,
+        model: PreTrainedModel | nn.Module,
+        inputs: dict[str, torch.Tensor | Any],
+        prediction_loss_only: bool,
+        ignore_keys: list[str] | None = None,
+    ):
+        if not self.use_dpo_data_collator:
+            logger.warning(
+                "prediction_step is only implemented for DPODataCollatorWithPadding, and you passed a datacollator that is different than "
+                "DPODataCollatorWithPadding - you might see unexpected behavior. Alternatively, you can implement your own prediction_step method if you are using a custom data collator"
+            )
+        if ignore_keys is None:
+            if hasattr(model, "config"):
+                ignore_keys = getattr(model.config, "keys_to_ignore_at_inference", [])
+            else:
+                ignore_keys = []
+
+        prediction_context_manager = (
+            autocast(self.accelerator.device.type)
+            if self._peft_has_been_casted_to_bf16
+            else nullcontext()
+        )
+
+        with torch.no_grad(), prediction_context_manager:
+            loss, metrics = self.get_batch_loss_metrics(
+                model, inputs, train_eval="eval"
+            )
+
+        # force log the metrics
+        self.store_metrics(metrics, train_eval="eval")
+
+        if prediction_loss_only:
+            return (loss.detach(), None, None)
+
+        # logits for the chosen and rejected samples from model
+        logits_dict = {
+            "eval_logits/chosen": metrics["eval_logits/chosen"],
+            "eval_logits/rejected": metrics["eval_logits/rejected"],
+        }
+        logits = [v for k, v in logits_dict.items() if k not in ignore_keys]
+        logits = torch.tensor(logits, device=self.accelerator.device)
+        labels = torch.zeros(logits.shape[0], device=self.accelerator.device)
+
+        return (loss.detach(), logits, labels)
+
+    def store_metrics(
+        self, metrics: dict[str, float], train_eval: Literal["train", "eval"] = "train"
+    ) -> None:
+        for key, value in metrics.items():
+            self._stored_metrics[train_eval][key].append(value)
+
+    def evaluation_loop(
+        self,
+        dataloader: DataLoader,
+        description: str,
+        prediction_loss_only: bool | None = None,
+        ignore_keys: list[str] | None = None,
+        metric_key_prefix: str = "eval",
+    ) -> EvalLoopOutput:
+        """
+        Overriding built-in evaluation loop to store metrics for each batch. Prediction/evaluation loop, shared by
+        `Trainer.evaluate()` and `Trainer.predict()`.
+
+        Works both with or without labels.
+        """
+        # Sample and save to game log if requested (for one batch to save time)
+        if self.generate_during_eval:
+            # Generate random indices within the range of the total number of samples
+            num_samples = len(dataloader.dataset)
+            random_indices = random.sample(
+                range(num_samples), k=self.args.eval_batch_size
+            )
+
+            # Use dataloader.dataset.select to get the random batch without iterating over the DataLoader
+            random_batch_dataset = dataloader.dataset.select(random_indices)
+            random_batch = self.data_collator(random_batch_dataset)
+            random_batch = self._prepare_inputs(random_batch)
+
+            policy_output_decoded = self.generate_from_model(self.model, random_batch)
+
+            table = pd.DataFrame(
+                columns=["Prompt", "Policy"],
+                data=[
+                    [prompt, pol[len(prompt) :]]
+                    for prompt, pol in zip(
+                        random_batch["prompt"], policy_output_decoded, strict=True
+                    )
+                ],
+            )
+            if "wandb" in self.args.report_to:
+                wandb.log({"game_log": wandb.Table(data=table)})
+
+            if "comet_ml" in self.args.report_to:
+                log_table_to_comet_experiment(
+                    name="game_log.csv",
+                    table=table,
+                )
+
+        # Base evaluation
+        initial_output = super().evaluation_loop(
+            dataloader,
+            description,
+            prediction_loss_only,
+            ignore_keys,
+            metric_key_prefix,
+        )
+
+        return initial_output
+
+    def log(self, logs: dict[str, float], start_time: float | None = None) -> None:
+        """
+        Log `logs` on the various objects watching training, including stored metrics.
+
+        Args:
+            logs (`dict[str, float]`):
+                The values to log.
+            start_time (`float`, *optional*):
+                Start time of the training.
+        """
+        # logs either has 'loss' or 'eval_loss'
+        train_eval = "train" if "loss" in logs else "eval"
+        # Add averaged stored metrics to logs
+        for key, metrics in self._stored_metrics[train_eval].items():
+            logs[key] = torch.tensor(metrics).mean().item()
+        del self._stored_metrics[train_eval]
+        return super().log(logs, start_time)
+
+    def _shift_right(self, input_ids):
+        if self.decoder_start_token_id is None:
+            raise ValueError(
+                "model.config.decoder_start_token_id has to be defined. It is usually set to the pad_token_id."
+            )
+
+        # shift inputs to the right
+        if is_torch_fx_proxy(input_ids):
+            # Item assignment is not supported natively for proxies.
+            shifted_input_ids = torch.full(
+                input_ids.shape[:-1] + (1,), self.decoder_start_token_id
+            )
+            shifted_input_ids = torch.cat(
+                [shifted_input_ids, input_ids[..., :-1]], dim=-1
+            )
+        else:
+            shifted_input_ids = input_ids.new_zeros(input_ids.shape)
+            shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
+            shifted_input_ids[..., 0] = self.decoder_start_token_id
+
+        if self.pad_token_id is None:
+            raise ValueError("model.config.pad_token_id has to be defined.")
+        # replace possible -100 values in labels by `pad_token_id`
+        shifted_input_ids.masked_fill_(shifted_input_ids == -100, self.pad_token_id)
+
+        return shifted_input_ids
+
+    # Ensure the model card is saved along with the checkpoint
+    def _save_checkpoint(self, model, trial):
+        if self.args.hub_model_id is None:
+            model_name = Path(self.args.output_dir).name
+        else:
+            model_name = self.args.hub_model_id.split("/")[-1]
+        self.create_model_card(model_name=model_name)
+        super()._save_checkpoint(model, trial)
diff --git a/src/aixpert/training/training/trl/trainer/ppo_config.py b/src/aixpert/training/training/trl/trainer/ppo_config.py
new file mode 100644
index 0000000..79c1ad0
--- /dev/null
+++ b/src/aixpert/training/training/trl/trainer/ppo_config.py
@@ -0,0 +1,140 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from dataclasses import dataclass, field
+from typing import Literal
+
+from ..trainer.utils import OnPolicyConfig
+
+
+@dataclass
+class PPOConfig(OnPolicyConfig):
+    r"""
+    Configuration class for the [`PPOTrainer`].
+
+    This class includes only the parameters that are specific to PPO training. For a full list of training arguments,
+    please refer to the [`~transformers.TrainingArguments`] and [`OnPolicyConfig`] documentation. Note that default
+    values in this class may differ from those in [`~transformers.TrainingArguments`].
+
+    Using [`~transformers.HfArgumentParser`] we can turn this class into
+    [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
+    command line.
+
+    Parameters
+    ----------
+        exp_name (`str`, *optional*, defaults to `os.path.basename(__file__)[:-3]`):
+            Name of this experiment.
+        reward_model_path (`str`, *optional*, defaults to `"EleutherAI/pythia-160m"`):
+            Path to the reward model.
+        model_adapter_name (`str`, *optional*):
+            Name of the train target PEFT adapter, when using LoRA with multiple adapters.
+        ref_adapter_name (`str`, *optional*):
+            Name of the reference PEFT adapter, when using LoRA with multiple adapters.
+        num_ppo_epochs (`int`, *optional*, defaults to `4`):
+            Number of epochs to train.
+        whiten_rewards (`bool`, *optional*, defaults to `False`):
+            Whether to whiten the rewards.
+        kl_coef (`float`, *optional*, defaults to `0.05`):
+            KL coefficient.
+        kl_estimator (`Literal["k1", "k3"]`, *optional*, defaults to `"k1"`):
+            Which estimator for KL-Divergence to use from [Approximating KL
+            Divergence](http://joschu.net/blog/kl-approx.html). Defaults to "k1", a straightforward, unbiased
+            estimator. Can be set to "k3", an unbiased estimator with lower variance which "appears to be a strictly
+            better estimator". Cannot be set to "k2", as it is used for logging purposes.
+        cliprange (`float`, *optional*, defaults to `0.2`):
+            Clip range.
+        vf_coef (`float`, *optional*, defaults to `0.1`):
+            Value function coefficient.
+        cliprange_value (`float`, *optional*, defaults to `0.2`):
+            Clip range for the value function.
+        gamma (`float`, *optional*, defaults to `1.0`):
+            Discount factor.
+        lam (`float`, *optional*, defaults to `0.95`):
+            Lambda value for GAE.
+        ds3_gather_for_generation (`bool`, *optional*, defaults to `True`):
+            This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation,
+            improving generation speed. However, disabling this option allows training models that exceed the VRAM
+            capacity of a single GPU, albeit at the cost of slower generation.
+    """
+
+    exp_name: str = field(
+        default=os.path.basename(__file__)[:-3],
+        metadata={"help": "Name of this experiment."},
+    )
+    reward_model_path: str = field(
+        default="EleutherAI/pythia-160m",
+        metadata={"help": "Path to the reward model."},
+    )
+    model_adapter_name: str | None = field(
+        default=None,
+        metadata={
+            "help": "Name of the train target PEFT adapter, when using LoRA with multiple adapters."
+        },
+    )
+    ref_adapter_name: str | None = field(
+        default=None,
+        metadata={
+            "help": "Name of the reference PEFT adapter, when using LoRA with multiple adapters."
+        },
+    )
+    num_ppo_epochs: int = field(
+        default=4,
+        metadata={"help": "Number of epochs to train."},
+    )
+    whiten_rewards: bool = field(
+        default=False,
+        metadata={"help": "Whether to whiten the rewards."},
+    )
+    kl_coef: float = field(
+        default=0.05,
+        metadata={"help": "KL coefficient."},
+    )
+    kl_estimator: Literal["k1", "k3"] = field(
+        default="k1",
+        metadata={
+            "help": "Which estimator for KL-Divergence to use from Approximating KL Divergence "
+            "(http://joschu.net/blog/kl-approx.html). Defaults to 'k1', a straightforward, unbiased estimator. Can be "
+            "set to 'k3', an unbiased estimator with lower variance which 'appears to be a strictly better "
+            "estimator'. Cannot be set to 'k2', as it is used for logging purposes."
+        },
+    )
+    cliprange: float = field(
+        default=0.2,
+        metadata={"help": "Clip range."},
+    )
+    vf_coef: float = field(
+        default=0.1,
+        metadata={"help": "Value function coefficient."},
+    )
+    cliprange_value: float = field(
+        default=0.2,
+        metadata={"help": "Clip range for the value function."},
+    )
+    gamma: float = field(
+        default=1.0,
+        metadata={"help": "Discount factor."},
+    )
+    lam: float = field(
+        default=0.95,
+        metadata={"help": "Lambda value for GAE."},
+    )
+    ds3_gather_for_generation: bool = field(
+        default=True,
+        metadata={
+            "help": "This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for "
+            "generation, improving generation speed. However, disabling this option allows training models that "
+            "exceed the VRAM capacity of a single GPU, albeit at the cost of slower generation."
+        },
+    )
diff --git a/src/aixpert/training/training/trl/trainer/ppo_trainer.py b/src/aixpert/training/training/trl/trainer/ppo_trainer.py
new file mode 100644
index 0000000..43f43aa
--- /dev/null
+++ b/src/aixpert/training/training/trl/trainer/ppo_trainer.py
@@ -0,0 +1,1080 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import math
+import os
+import textwrap
+import time
+import warnings
+from collections import defaultdict
+from contextlib import contextmanager, nullcontext
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+import torch
+from accelerate import Accelerator, logging
+from accelerate.utils import broadcast, gather_object
+from datasets import Dataset
+from torch import nn
+from torch.utils.data import DataLoader
+from transformers import (
+    BaseImageProcessor,
+    DataCollatorWithPadding,
+    FeatureExtractionMixin,
+    GenerationConfig,
+    PreTrainedTokenizerBase,
+    ProcessorMixin,
+    TrainerCallback,
+    TrainerControl,
+)
+from transformers.integrations import get_reporting_integration_callbacks
+from transformers.trainer import DEFAULT_CALLBACKS, DEFAULT_PROGRESS_CALLBACK
+from transformers.trainer_callback import (
+    CallbackHandler,
+    ExportableState,
+    PrinterCallback,
+)
+from transformers.utils import is_peft_available, is_rich_available
+
+from ..models import create_reference_model
+from ..models.utils import unwrap_model_for_generation
+from .base_trainer import BaseTrainer
+from .ppo_config import PPOConfig
+from .utils import (
+    OnlineTrainerState,
+    batch_generation,
+    disable_dropout_in_model,
+    empty_cache,
+    exact_div,
+    first_true_indices,
+    forward,
+    get_reward,
+    log_table_to_comet_experiment,
+    peft_module_casting_to_bf16,
+    prepare_deepspeed,
+    print_rich_table,
+    selective_log_softmax,
+    truncate_response,
+)
+
+
+logger = logging.get_logger(__name__)
+
+if is_peft_available():
+    from peft import PeftConfig, PeftModel, get_peft_model
+
+
+INVALID_LOGPROB = 1.0
+
+
+def masked_mean(
+    values: torch.Tensor, mask: torch.Tensor, axis: bool | None = None
+) -> torch.Tensor:
+    """Compute mean of tensor with a masked values."""
+    if axis is not None:
+        return (values * mask).sum(axis=axis) / mask.sum(axis=axis)
+    return (values * mask).sum() / mask.sum()
+
+
+def masked_var(
+    values: torch.Tensor, mask: torch.Tensor, unbiased: bool = True
+) -> torch.Tensor:
+    """Compute variance of tensor with masked values."""
+    mean = masked_mean(values, mask)
+    centered_values = values - mean
+    variance = masked_mean(centered_values**2, mask)
+    if unbiased:
+        mask_sum = mask.sum()
+        if mask_sum == 0:
+            raise ValueError(
+                "The sum of the mask is zero, which can happen when `mini_batch_size=1`;"
+                "try increase the `mini_batch_size` or `gradient_accumulation_steps`"
+            )
+        # note that if mask_sum == 1, then there is a division by zero issue
+        # to avoid it you just need to use a larger minibatch_size
+        bessel_correction = mask_sum / (mask_sum - 1)
+        variance = variance * bessel_correction
+    return variance
+
+
+def masked_whiten(
+    values: torch.Tensor, mask: torch.Tensor, shift_mean: bool = True
+) -> torch.Tensor:
+    """Whiten values with masked values."""
+    mean, var = masked_mean(values, mask), masked_var(values, mask)
+    whitened = (values - mean) * torch.rsqrt(var + 1e-8)
+    if not shift_mean:
+        whitened += mean
+    return whitened
+
+
+# taken from https://github.com/OpenLMLab/MOSS-RLHF/blob/40b91eb2f2b71b16919addede0341d2bef70825d/ppo/ppo_trainer.py#L29
+# we did this we can do a single `model = accelerator.prepare(model)`
+class PolicyAndValueWrapper(nn.Module):
+    def __init__(self, policy, value_model) -> None:
+        super().__init__()
+        self.policy = policy
+        self.value_model = value_model
+        self.critic_backbone = getattr(value_model, value_model.base_model_prefix)
+        self.is_gradient_checkpointing = policy.is_gradient_checkpointing
+
+    def forward(self, **kwargs):
+        output = self.critic_backbone(**kwargs)
+        logits = self.value_model.score(output.hidden_states[-1])
+        return self.policy(**kwargs), logits
+
+
+class PPOTrainer(BaseTrainer):
+    """Trainer for Proximal Policy Optimization (PPO).
+
+    For details on PPO, see the paper: [Proximal Policy Optimization
+    Algorithms](https://huggingface.co/papers/1707.06347).
+
+    Args:
+        args ([`PPOConfig`]):
+            Training arguments.
+        processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`]):
+            Class to process the data.
+        model (`torch.nn.Module`):
+            Model to be trained. This is the policy model.
+        ref_model (`torch.nn.Module`, *optional*):
+            Reference model used to compute the KL divergence. If `None`, a copy of the policy model is created.
+        reward_model (`torch.nn.Module`):
+            Reward model used to compute the rewards.
+        train_dataset ([`~datasets.Dataset`]):
+            Dataset for training.
+        value_model (`torch.nn.Module`):
+            Value model used to predict the value of a state.
+        data_collator ([`~transformers.DataCollatorWithPadding`], *optional*):
+            Data collator to batch and pad samples from the dataset. If `None`, a default data collator is created
+            using the `processing_class`.
+        eval_dataset ([`~datasets.Dataset`] or `dict` of [`~datasets.Dataset`], *optional*):
+            Dataset for evaluation.
+        optimizers (`tuple` of `torch.optim.Optimizer` and `torch.optim.lr_scheduler.LambdaLR`, *optional*, defaults to `(None, None)`):
+            Tuple containing the optimizer and the learning rate scheduler to use for training. If `None`, the
+            optimizer and the learning rate scheduler are created using the
+            [`~transformers.Trainer.create_optimizer_and_scheduler`] method.
+        callbacks (`list` of [`~transformers.TrainerCallback`], *optional*):
+            Callbacks to use during training.
+        peft_config ([`~peft.PeftConfig`], *optional*):
+            PEFT configuration to use PEFT for training. If `None`, PEFT is not used. If provided, the policy `model`
+            will be wrapped with the specified PEFT adapter.
+    """
+
+    _tag_names = ["trl", "ppo"]
+    _name = "PPO"
+    _paper = {
+        "title": "Fine-Tuning Language Models from Human Preferences",
+        "id": "1909.08593",
+        # docstyle-ignore
+        "citation": textwrap.dedent("""\
+            @article{mziegler2019fine-tuning,
+                title        = {{Fine-Tuning Language Models from Human Preferences}},
+                author       = {Daniel M. Ziegler and Nisan Stiennon and Jeffrey Wu and Tom B. Brown and Alec Radford and Dario Amodei and Paul F. Christiano and Geoffrey Irving},
+                year         = 2019,
+                eprint       = {arXiv:1909.08593}
+            }"""),
+    }
+
+    def __init__(
+        self,
+        args: PPOConfig,
+        processing_class: PreTrainedTokenizerBase
+        | BaseImageProcessor
+        | FeatureExtractionMixin
+        | ProcessorMixin,
+        model: nn.Module,
+        ref_model: nn.Module | None,
+        reward_model: nn.Module,
+        train_dataset: Dataset,
+        value_model: nn.Module,
+        data_collator: DataCollatorWithPadding | None = None,
+        eval_dataset: Dataset | dict[str, Dataset] | None = None,
+        # less commonly used
+        optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (
+            None,
+            None,
+        ),
+        callbacks: list[TrainerCallback] | None = None,
+        peft_config: "PeftConfig | None" = None,
+    ) -> None:
+        if not os.environ.get("TRL_EXPERIMENTAL_SILENCE"):
+            warnings.warn(
+                "This trainer will soon be moved to trl.experimental and is a candidate for removal. If you rely on "
+                "it and want it to remain, please share your comments here: "
+                "https://github.com/huggingface/trl/issues/4223. Silence this warning by setting environment variable "
+                "TRL_EXPERIMENTAL_SILENCE=1."
+            )
+        if ref_model is model:
+            raise ValueError(
+                "`model` and `ref_model` cannot be the same object. If you want `ref_model` to be the "
+                "same as `model`, you must make a copy of it, or `None` if you use peft."
+            )
+
+        self.args = args
+        self.processing_class = processing_class
+        self.policy_model = model
+
+        # Define the collator if not provided
+        if data_collator is None:
+            data_collator = DataCollatorWithPadding(self.processing_class)
+
+        # Handle stop token settings: update policy model's generation_config to use provided stop token
+        if args.stop_token and args.stop_token_id:
+            raise ValueError("You cannot set both `stop_token` and `stop_token_id`.")
+        if args.stop_token:
+            if args.stop_token == "eos":
+                self.policy_model.generation_config.eos_token_id = (
+                    self.stop_token_id
+                ) = processing_class.eos_token_id
+            else:
+                raise ValueError(
+                    f"Unknown `stop_token` {args.stop_token}. Allowed values are: `'eos'` and `None` (no stop token)."
+                )
+        else:
+            self.policy_model.generation_config.eos_token_id = self.stop_token_id = (
+                args.stop_token_id
+            )  # None or int
+
+        # Check that the kl estimator is valid
+        if self.args.kl_estimator not in {"k1", "k3"}:
+            raise ValueError(
+                "kl_estimator must be either 'k1' (straightforward, unbiased) or 'k3' (lower variance, unbiased, "
+                "appears to be a strictly better estimator). See "
+                "[Approximating KL Divergence](http://joschu.net/blog/kl-approx.html) for details."
+            )
+
+        # peft support
+        if not is_peft_available() and peft_config is not None:
+            raise ImportError(
+                "PEFT is not installed and you passed a `peft_config` in the trainer's kwargs, please install it to use the PEFT models"
+            )
+        if is_peft_available() and peft_config is not None:
+            # if model is a peft model and we have a peft_confg, we merge and unload it first
+            if isinstance(self.policy_model, PeftModel):
+                self.policy_model = self.policy_model.merge_and_unload()
+
+            # get peft model with the given config
+            self.policy_model = get_peft_model(self.policy_model, peft_config)
+            if args.bf16 and getattr(self.policy_model, "is_loaded_in_4bit", False):
+                peft_module_casting_to_bf16(self.policy_model)
+
+        self.is_peft_model = is_peft_available() and isinstance(
+            self.policy_model, PeftModel
+        )
+        self.model_adapter_name = args.model_adapter_name
+        self.ref_adapter_name = args.ref_adapter_name
+
+        if ref_model:
+            self.ref_model = ref_model
+        elif self.is_peft_model:
+            self.ref_model = None
+        else:
+            self.ref_model = create_reference_model(self.policy_model)
+
+        self.reward_model = reward_model
+        self.train_dataset = train_dataset
+        self.train_dataset_len = len(train_dataset)
+        self.value_model = value_model
+        self.data_collator = data_collator
+        self.eval_dataset = eval_dataset
+        self.optimizer, self.lr_scheduler = optimizers
+        self.optimizer_cls_and_kwargs = None  # needed for transformers >= 4.47
+
+        #########
+        # calculate various batch sizes
+        #########
+        if (
+            args.total_episodes is None
+        ):  # allow the users to define episodes in terms of epochs.
+            args.total_episodes = int(args.num_train_epochs * self.train_dataset_len)
+        accelerator = Accelerator(
+            gradient_accumulation_steps=args.gradient_accumulation_steps
+        )
+        self.accelerator = accelerator
+        args.world_size = accelerator.num_processes
+        args.local_batch_size = (
+            args.per_device_train_batch_size * args.gradient_accumulation_steps
+        )
+        args.micro_batch_size = int(args.per_device_train_batch_size * args.world_size)
+        args.batch_size = int(args.local_batch_size * args.world_size)
+        args.mini_batch_size = exact_div(
+            args.batch_size,
+            args.num_mini_batches,
+            "`batch_size` must be a multiple of `num_mini_batches`",
+        )
+        args.local_mini_batch_size = exact_div(
+            args.local_batch_size,
+            args.num_mini_batches,
+            "`local_batch_size` must be a multiple of `num_mini_batches`",
+        )
+        if args.whiten_rewards:
+            assert args.local_mini_batch_size >= 8, (
+                f"Per-rank minibatch size {args.local_mini_batch_size} is insufficient for whitening"
+            )
+        # `per_rank_rollout_batch_size` is our `args.local_batch_size`
+        # `per_rank_minibatch_size` is our `args.local_mini_batch_size`
+        args.num_total_batches = math.ceil(
+            args.total_episodes / args.batch_size
+        )  # we may train for more than `total_episodes`
+        time_tensor = torch.tensor(int(time.time()), device=accelerator.device)
+        time_int = broadcast(
+            time_tensor, 0
+        ).item()  # avoid different timestamps across processes
+        args.run_name = f"{args.exp_name}__{args.seed}__{time_int}"
+        self.local_seed = args.seed + accelerator.process_index * 100003  # Prime
+        if args.num_sample_generations > 0:
+            self.sample_generations_freq = max(
+                1, args.num_total_batches // args.num_sample_generations
+            )
+        self.local_dataloader_batch_size = args.local_batch_size
+
+        #########
+        # setup model, optimizer, and others
+        #########
+        for module in [
+            self.policy_model,
+            self.ref_model,
+            self.value_model,
+            self.reward_model,
+        ]:
+            if module is not None:
+                disable_dropout_in_model(module)
+        self.model = PolicyAndValueWrapper(self.policy_model, self.value_model)
+        self.model.config = self.policy_model.config  # needed for pushing to hub
+        self.create_optimizer_and_scheduler(
+            num_training_steps=args.num_total_batches
+        )  # note that we are calling `self.lr_scheduler.step()` manually only at the batch level
+
+        #########
+        # trainer specifics
+        #########
+        default_callbacks = DEFAULT_CALLBACKS + get_reporting_integration_callbacks(
+            self.args.report_to
+        )
+        self.callbacks = (
+            default_callbacks if callbacks is None else default_callbacks + callbacks
+        )
+        self.callback_handler = CallbackHandler(
+            self.callbacks,
+            self.model,
+            self.processing_class,
+            self.optimizer,
+            self.lr_scheduler,
+        )
+        self.add_callback(
+            PrinterCallback if self.args.disable_tqdm else DEFAULT_PROGRESS_CALLBACK
+        )
+        self.control = TrainerControl()
+        self.state = OnlineTrainerState(
+            is_local_process_zero=self.is_local_process_zero(),
+            is_world_process_zero=self.is_world_process_zero(),
+            stateful_callbacks=[
+                cb
+                for cb in self.callback_handler.callbacks + [self.control]
+                if isinstance(cb, ExportableState)
+            ],
+        )
+        self.current_flos = 0
+        self.hp_search_backend = None
+        self.is_deepspeed_enabled = (
+            getattr(self.accelerator.state, "deepspeed_plugin", None) is not None
+        )
+        self.is_fsdp_enabled = (
+            getattr(self.accelerator.state, "fsdp_plugin", None) is not None
+        )
+        # Create distant repo and output directory if needed
+        self.hub_model_id = None
+        if self.args.push_to_hub:
+            self.init_hf_repo()
+        if self.args.should_save:
+            os.makedirs(self.args.output_dir, exist_ok=True)
+
+        # Add tags for models that have been loaded with the correct transformers version
+        if hasattr(self.model, "add_model_tags"):
+            self.model.add_model_tags(self._tag_names)
+
+        #########
+        # setup dataloader
+        #########
+        self.dataloader = DataLoader(
+            self.train_dataset,
+            batch_size=self.local_dataloader_batch_size,
+            shuffle=True,
+            collate_fn=self.data_collator,
+            drop_last=True,  # needed; otherwise the last batch will be of ragged shape
+        )
+        # sync random states for DataLoader(shuffle=True) before `accelerator.prepare`
+        # see https://gist.github.com/vwxyzjn/2581bff1e48e185e0b85b6dfe1def79c
+        torch.manual_seed(args.seed)
+        self.model, self.optimizer, self.dataloader = accelerator.prepare(
+            self.model, self.optimizer, self.dataloader
+        )
+        torch.manual_seed(self.local_seed)  # reset the local seed again
+
+        self.eval_dataloader = DataLoader(
+            self.eval_dataset,
+            batch_size=args.per_device_eval_batch_size,
+            collate_fn=self.data_collator,
+            drop_last=True,
+        )  # no need to shuffle eval dataset
+        self.eval_dataloader = accelerator.prepare(self.eval_dataloader)
+
+        if self.is_deepspeed_enabled:
+            self.reward_model = prepare_deepspeed(
+                self.reward_model,
+                args.per_device_train_batch_size,
+                args.fp16,
+                args.bf16,
+            )
+
+            if self.ref_model is None:
+                if not self.is_peft_model:
+                    raise ValueError(
+                        "No reference model and model is not a Peft model."
+                    )
+            else:
+                self.ref_model = prepare_deepspeed(
+                    self.ref_model,
+                    args.per_device_train_batch_size,
+                    args.fp16,
+                    args.bf16,
+                )
+        else:
+            if self.ref_model is None:
+                if not self.is_peft_model:
+                    raise ValueError(
+                        "No reference model and model is not a Peft model."
+                    )
+            else:
+                self.ref_model = self.ref_model.to(self.accelerator.device)
+            self.reward_model = self.reward_model.to(self.accelerator.device)
+
+    def get_train_dataloader(self) -> DataLoader:
+        return self.dataloader
+
+    def get_eval_dataloader(self) -> DataLoader:
+        return self.eval_dataloader
+
+    @contextmanager
+    def null_ref_context(self):
+        """Context manager for handling null reference model (that is, peft adapter manipulation)."""
+        with (
+            self.accelerator.unwrap_model(self.model.policy).disable_adapter()
+            if self.is_peft_model and not self.ref_adapter_name
+            else nullcontext()
+        ):
+            if self.ref_adapter_name:
+                self.model.policy.set_adapter(self.ref_adapter_name)
+            yield
+            if self.ref_adapter_name:
+                self.model.policy.set_adapter(self.model_adapter_name or "default")
+
+    def save_model(self, output_dir: str | None = None, _internal_call: bool = False):
+        backup_model = self.model
+        self.model = self.model.policy  # save only the policy
+
+        if self.is_deepspeed_enabled:
+            backup_deepspeed = self.deepspeed
+            self.deepspeed = self.model
+
+        super().save_model(output_dir, _internal_call)
+
+        self.model = backup_model
+
+        if self.is_deepspeed_enabled:
+            self.deepspeed = backup_deepspeed
+
+    def train(self):
+        args = self.args
+        accelerator = self.accelerator
+        optimizer = self.optimizer
+        model = self.model
+        ref_policy = self.ref_model
+        reward_model = self.reward_model
+        processing_class = self.processing_class
+        dataloader = self.dataloader
+        device = accelerator.device
+
+        def repeat_generator():
+            while True:
+                yield from dataloader
+
+        iter_dataloader = iter(repeat_generator())
+        generation_config = GenerationConfig(
+            max_new_tokens=args.response_length,
+            temperature=(args.temperature + 1e-7),
+            top_k=0.0,
+            top_p=1.0,
+            do_sample=True,
+        )
+
+        accelerator.print("===training policy===")
+        start_time = time.time()
+        stats_shape = (
+            args.num_ppo_epochs,
+            args.num_mini_batches,
+            args.gradient_accumulation_steps,
+        )
+        approxkl_stats = torch.zeros(stats_shape, device=device)
+        pg_clipfrac_stats = torch.zeros(stats_shape, device=device)
+        pg_loss_stats = torch.zeros(stats_shape, device=device)
+        vf_loss_stats = torch.zeros(stats_shape, device=device)
+        vf_clipfrac_stats = torch.zeros(stats_shape, device=device)
+        entropy_stats = torch.zeros(stats_shape, device=device)
+        ratio_stats = torch.zeros(stats_shape, device=device)
+        model.train()
+
+        # trainer state initialization
+        self.state.global_step = 0
+        self.state.episode = 0
+        self.state.max_steps = args.num_total_batches
+        self.state.num_train_epochs = args.total_episodes / self.train_dataset_len
+        # Compute absolute values for logging, eval, and save if given as ratio
+        if args.logging_steps is not None:
+            if args.logging_steps < 1:
+                self.state.logging_steps = math.ceil(
+                    self.state.max_steps * args.logging_steps
+                )
+            else:
+                self.state.logging_steps = args.logging_steps
+        if args.eval_steps is not None:
+            if args.eval_steps < 1:
+                self.state.eval_steps = math.ceil(
+                    self.state.max_steps * args.eval_steps
+                )
+            else:
+                self.state.eval_steps = args.eval_steps
+        if args.save_steps is not None:
+            if args.save_steps < 1:
+                self.state.save_steps = math.ceil(
+                    self.state.max_steps * args.save_steps
+                )
+            else:
+                self.state.save_steps = args.save_steps
+        self.control = self.callback_handler.on_train_begin(
+            args, self.state, self.control
+        )
+
+        # backward compatibility
+        if self.is_deepspeed_enabled:
+            self.deepspeed = self.model
+            self.model_wrapped = self.model
+
+        for update in range(1, args.num_total_batches + 1):
+            self.state.episode += 1 * args.batch_size
+            data = next(iter_dataloader)
+            with torch.no_grad():
+                queries = data["input_ids"].to(device)
+                context_length = queries.shape[1]
+                responses = []
+                postprocessed_responses = []
+                logprobs = []
+                ref_logprobs = []
+                scores = []
+                sequence_lengths = []
+                values = []
+                with unwrap_model_for_generation(
+                    self.model,
+                    self.accelerator,
+                    gather_deepspeed3_params=self.args.ds3_gather_for_generation,
+                ) as unwrapped_model:
+                    query_responses, logitss = batch_generation(
+                        unwrapped_model.policy,
+                        queries,
+                        args.local_rollout_forward_batch_size,
+                        processing_class.pad_token_id,
+                        generation_config,
+                    )
+
+                for i in range(
+                    0, queries.shape[0], args.local_rollout_forward_batch_size
+                ):
+                    query = queries[i : i + args.local_rollout_forward_batch_size]
+                    query_response = query_responses[
+                        i : i + args.local_rollout_forward_batch_size
+                    ]
+                    response = query_response[:, context_length:]
+                    logits = logitss[i : i + args.local_rollout_forward_batch_size]
+                    logprob = selective_log_softmax(logits, response)
+                    del logits
+                    empty_cache()
+
+                    if ref_policy is None:
+                        with self.null_ref_context():
+                            ref_output = forward(
+                                model.policy,
+                                query_response,
+                                processing_class.pad_token_id,
+                            )
+                    else:
+                        ref_output = forward(
+                            ref_policy, query_response, processing_class.pad_token_id
+                        )
+                    ref_logits = ref_output.logits[:, context_length - 1 : -1]
+                    ref_logits /= args.temperature + 1e-7
+                    ref_logprob = selective_log_softmax(ref_logits, response)
+                    del ref_output, ref_logits
+                    empty_cache()
+
+                    # Response Processing 1. truncate response after the first occurrence of `stop_token_id`
+                    postprocessed_response = response
+                    if (
+                        self.stop_token_id is not None
+                    ):  # handle the edge case when stop_token_id exists but is 0
+                        postprocessed_response = truncate_response(
+                            self.stop_token_id, processing_class.pad_token_id, response
+                        )
+
+                    # Response Processing 2. run reward model on the truncated responses
+                    postprocessed_query_response = torch.cat(
+                        (query, postprocessed_response), 1
+                    )
+                    sequence_length = (
+                        first_true_indices(
+                            postprocessed_response == processing_class.pad_token_id
+                        )
+                        - 1
+                    )
+                    unwrapped_value_model = accelerator.unwrap_model(model).value_model
+                    full_value, _, _ = get_reward(
+                        unwrapped_value_model,
+                        query_response,
+                        processing_class.pad_token_id,
+                        context_length,
+                    )
+                    value = full_value[:, context_length - 1 : -1].squeeze(-1)
+                    _, score, _ = get_reward(
+                        reward_model,
+                        postprocessed_query_response,
+                        processing_class.pad_token_id,
+                        context_length,
+                    )
+
+                    responses.append(response)
+                    postprocessed_responses.append(postprocessed_response)
+                    logprobs.append(logprob)
+                    ref_logprobs.append(ref_logprob)
+                    sequence_lengths.append(sequence_length)
+                    scores.append(score)
+                    values.append(value)
+                responses = torch.cat(responses, 0)
+                postprocessed_responses = torch.cat(postprocessed_responses, 0)
+                logprobs = torch.cat(logprobs, 0)
+                ref_logprobs = torch.cat(ref_logprobs, 0)
+                sequence_lengths = torch.cat(sequence_lengths, 0)
+                scores = torch.cat(scores, 0)
+                values = torch.cat(values, 0)
+                del (logprob, ref_logprob, full_value, value, score, unwrapped_model)
+                empty_cache()
+                gc.collect()
+
+                # Response Processing 3. Filter completion. Ensure that the sample contains stop_token_id
+                # Completions not passing that filter will receive a lower score.
+                contain_eos_token = torch.any(
+                    postprocessed_responses == self.processing_class.eos_token_id,
+                    dim=-1,
+                )
+                if self.args.missing_eos_penalty is not None:
+                    scores[~contain_eos_token] -= self.args.missing_eos_penalty
+                # accelerator.print(f"{scores=}, {(contain_eos_token.sum() / len(contain_eos_token))=}")
+
+                # be very careful with `padding_mask_p1`; see https://excalidraw.com/#json=LWnzG4w2k5DjF_EOL_xPt,e2w3a-hFJ_gX5vOfeyXGTw
+                response_idxs = torch.arange(
+                    responses.shape[1], device=responses.device
+                ).repeat(responses.shape[0], 1)
+                padding_mask = response_idxs > sequence_lengths.unsqueeze(1)
+                logprobs = torch.masked_fill(logprobs, padding_mask, INVALID_LOGPROB)
+                ref_logprobs = torch.masked_fill(
+                    ref_logprobs, padding_mask, INVALID_LOGPROB
+                )
+                sequence_lengths_p1 = sequence_lengths + 1
+                padding_mask_p1 = response_idxs > (sequence_lengths_p1.unsqueeze(1))
+                values = torch.masked_fill(values, padding_mask_p1, 0)
+
+                # 4. compute rewards
+                # Formula used by http://joschu.net/blog/kl-approx.html for the k1 and k3 estimators
+                logr = ref_logprobs - logprobs
+                kl = (
+                    -logr if args.kl_estimator == "k1" else (logr.exp() - 1) - logr
+                )  # Else statement is k3
+                non_score_reward = -args.kl_coef * kl
+                rewards = non_score_reward.clone()
+                actual_start = torch.arange(rewards.size(0), device=rewards.device)
+                actual_end = torch.where(
+                    sequence_lengths_p1 < rewards.size(1),
+                    sequence_lengths_p1,
+                    sequence_lengths,
+                )
+                rewards[actual_start, actual_end] += scores
+
+                # 5. whiten rewards
+                if args.whiten_rewards:
+                    rewards = masked_whiten(
+                        rewards, mask=~padding_mask_p1, shift_mean=False
+                    )
+                    rewards = torch.masked_fill(rewards, padding_mask_p1, 0)
+
+                # 6. compute advantages and returns
+                lastgaelam = 0
+                advantages_reversed = []
+                gen_length = responses.shape[1]
+                for t in reversed(range(gen_length)):
+                    nextvalues = values[:, t + 1] if t < gen_length - 1 else 0.0
+                    delta = rewards[:, t] + args.gamma * nextvalues - values[:, t]
+                    lastgaelam = delta + args.gamma * args.lam * lastgaelam
+                    advantages_reversed.append(lastgaelam)
+                advantages = torch.stack(advantages_reversed[::-1], axis=1)
+                returns = advantages + values
+                advantages = masked_whiten(advantages, ~padding_mask)
+                advantages = torch.masked_fill(advantages, padding_mask, 0)
+                empty_cache()
+
+            # Do multiple epochs of PPO training, with a fresh random shuffle in each epoch
+            for ppo_epoch_idx in range(args.num_ppo_epochs):
+                b_inds = np.random.permutation(args.local_batch_size)
+                minibatch_idx = 0
+                for mini_batch_start in range(
+                    0, args.local_batch_size, args.local_mini_batch_size
+                ):
+                    mini_batch_end = mini_batch_start + args.local_mini_batch_size
+                    mini_batch_inds = b_inds[mini_batch_start:mini_batch_end]
+                    gradient_accumulation_idx = 0
+                    for micro_batch_start in range(
+                        0, args.local_mini_batch_size, args.per_device_train_batch_size
+                    ):
+                        with accelerator.accumulate(model):
+                            micro_batch_end = (
+                                micro_batch_start + args.per_device_train_batch_size
+                            )
+                            micro_batch_inds = mini_batch_inds[
+                                micro_batch_start:micro_batch_end
+                            ]
+                            mb_advantage = advantages[micro_batch_inds]
+                            mb_responses = responses[micro_batch_inds]
+                            mb_query_responses = query_responses[micro_batch_inds]
+                            mb_logprobs = logprobs[micro_batch_inds]
+                            mb_return = returns[micro_batch_inds]
+                            mb_values = values[micro_batch_inds]
+
+                            output, vpred_temp = forward(
+                                model, mb_query_responses, processing_class.pad_token_id
+                            )
+                            logits = output.logits[:, context_length - 1 : -1]
+                            logits /= args.temperature + 1e-7
+                            new_logprobs = selective_log_softmax(logits, mb_responses)
+                            new_logprobs = torch.masked_fill(
+                                new_logprobs,
+                                padding_mask[micro_batch_inds],
+                                INVALID_LOGPROB,
+                            )
+                            vpred = vpred_temp[:, context_length - 1 : -1].squeeze(-1)
+                            vpred = torch.masked_fill(
+                                vpred, padding_mask_p1[micro_batch_inds], 0
+                            )
+                            vpredclipped = torch.clamp(
+                                vpred,
+                                mb_values - args.cliprange_value,
+                                mb_values + args.cliprange_value,
+                            )
+                            vf_losses1 = torch.square(vpred - mb_return)
+                            vf_losses2 = torch.square(vpredclipped - mb_return)
+                            vf_loss_max = torch.max(vf_losses1, vf_losses2)
+                            vf_loss = 0.5 * masked_mean(
+                                vf_loss_max, ~padding_mask_p1[micro_batch_inds]
+                            )
+                            vf_clipfrac = masked_mean(
+                                (vf_losses2 > vf_losses1).float(),
+                                ~padding_mask_p1[micro_batch_inds],
+                            )
+                            logprobs_diff = new_logprobs - mb_logprobs
+                            ratio = torch.exp(logprobs_diff)
+                            pg_losses = -mb_advantage * ratio
+                            pg_losses2 = -mb_advantage * torch.clamp(
+                                ratio, 1.0 - args.cliprange, 1.0 + args.cliprange
+                            )
+                            pg_loss_max = torch.max(pg_losses, pg_losses2)
+                            pg_loss = masked_mean(
+                                pg_loss_max, ~padding_mask[micro_batch_inds]
+                            )
+                            loss = pg_loss + args.vf_coef * vf_loss
+                            accelerator.backward(loss)
+                            optimizer.step()
+                            optimizer.zero_grad()
+                            with torch.no_grad():
+                                pg_clipfrac = masked_mean(
+                                    (pg_losses2 > pg_losses).float(),
+                                    ~padding_mask[micro_batch_inds],
+                                )
+                                prob_dist = torch.nn.functional.softmax(logits, dim=-1)
+                                entropy = torch.logsumexp(logits, dim=-1) - torch.sum(
+                                    prob_dist * logits, dim=-1
+                                )
+                                approxkl = 0.5 * (logprobs_diff**2).mean()
+                                approxkl_stats[
+                                    ppo_epoch_idx,
+                                    minibatch_idx,
+                                    gradient_accumulation_idx,
+                                ] = approxkl
+                                pg_clipfrac_stats[
+                                    ppo_epoch_idx,
+                                    minibatch_idx,
+                                    gradient_accumulation_idx,
+                                ] = pg_clipfrac
+                                pg_loss_stats[
+                                    ppo_epoch_idx,
+                                    minibatch_idx,
+                                    gradient_accumulation_idx,
+                                ] = pg_loss
+                                vf_loss_stats[
+                                    ppo_epoch_idx,
+                                    minibatch_idx,
+                                    gradient_accumulation_idx,
+                                ] = vf_loss
+                                vf_clipfrac_stats[
+                                    ppo_epoch_idx,
+                                    minibatch_idx,
+                                    gradient_accumulation_idx,
+                                ] = vf_clipfrac
+                                entropy_stats[
+                                    ppo_epoch_idx,
+                                    minibatch_idx,
+                                    gradient_accumulation_idx,
+                                ] = entropy.mean()
+                                ratio_stats[
+                                    ppo_epoch_idx,
+                                    minibatch_idx,
+                                    gradient_accumulation_idx,
+                                ] = ratio.mean()
+                        gradient_accumulation_idx += 1
+                    minibatch_idx += 1
+                    # del everything and empty cache
+                    # fmt: off
+                    del (
+                        output, vpred_temp, logits, new_logprobs, vpred, vpredclipped,
+                        vf_losses1, vf_losses2, vf_loss, vf_clipfrac, logprobs_diff, ratio, pg_losses, pg_losses2, pg_loss_max,
+                        pg_loss, loss, pg_clipfrac, prob_dist, entropy, approxkl, mb_return,
+                        mb_advantage, mb_values, mb_responses, mb_query_responses, mb_logprobs,
+                    )
+                    # fmt: on
+                    empty_cache()
+            with torch.no_grad():
+                mean_kl = kl.sum(1).mean()
+                mean_entropy = (-logprobs).sum(1).mean()
+                mean_non_score_reward = non_score_reward.sum(1).mean()
+                rlhf_reward = mean_non_score_reward + scores.mean()
+                eps = int(self.state.episode / (time.time() - start_time))
+                metrics = {}
+                metrics["eps"] = eps
+                metrics["objective/kl"] = (
+                    self.accelerator.gather_for_metrics(mean_kl).mean().item()
+                )
+                metrics["objective/entropy"] = (
+                    self.accelerator.gather_for_metrics(mean_entropy).mean().item()
+                )
+                metrics["objective/non_score_reward"] = (
+                    self.accelerator.gather_for_metrics(mean_non_score_reward)
+                    .mean()
+                    .item()
+                )
+                metrics["objective/rlhf_reward"] = (
+                    self.accelerator.gather_for_metrics(rlhf_reward).mean().item()
+                )
+                metrics["objective/scores"] = (
+                    self.accelerator.gather_for_metrics(scores.mean()).mean().item()
+                )
+                metrics["policy/approxkl_avg"] = (
+                    self.accelerator.gather_for_metrics(approxkl_stats).mean().item()
+                )
+                metrics["policy/clipfrac_avg"] = (
+                    self.accelerator.gather_for_metrics(pg_clipfrac_stats).mean().item()
+                )
+                metrics["loss/policy_avg"] = (
+                    self.accelerator.gather_for_metrics(pg_loss_stats).mean().item()
+                )
+                metrics["loss/value_avg"] = (
+                    self.accelerator.gather_for_metrics(vf_loss_stats).mean().item()
+                )
+                metrics["val/clipfrac_avg"] = (
+                    self.accelerator.gather_for_metrics(vf_clipfrac_stats).mean().item()
+                )
+                metrics["policy/entropy_avg"] = (
+                    self.accelerator.gather_for_metrics(entropy_stats).mean().item()
+                )
+                metrics["val/ratio"] = (
+                    self.accelerator.gather_for_metrics(ratio_stats).mean().item()
+                )
+                metrics["val/ratio_var"] = (
+                    self.accelerator.gather_for_metrics(ratio_stats).var().item()
+                )
+                metrics["val/num_eos_tokens"] = (
+                    (responses == processing_class.eos_token_id).sum().item()
+                )
+                metrics["lr"] = self.lr_scheduler.get_last_lr()[0]
+                metrics["episode"] = self.state.episode
+                self.state.epoch = (
+                    self.state.episode / self.train_dataset_len
+                )  # used by self.log
+                self.state.global_step += 1
+                self.log(metrics)
+
+            self.lr_scheduler.step()
+            self.control = self.callback_handler.on_step_end(
+                args, self.state, self.control
+            )
+            if self.control.should_save:
+                self._save_checkpoint(model, trial=None)
+                self.control = self.callback_handler.on_save(
+                    self.args, self.state, self.control
+                )
+            del (
+                kl,
+                mean_kl,
+                mean_entropy,
+                mean_non_score_reward,
+                scores,
+                metrics,
+                non_score_reward,
+            )
+            empty_cache()
+            gc.collect()
+
+            if (
+                args.num_sample_generations > 0
+                and (update - 1) % self.sample_generations_freq == 0
+            ):
+                self.generate_completions(sampling=True)
+                empty_cache()
+            del (
+                query_responses,
+                responses,
+                postprocessed_responses,
+                logprobs,
+                ref_logprobs,
+                values,
+                sequence_lengths,
+                contain_eos_token,
+                sequence_lengths_p1,
+                response_idxs,
+                padding_mask,
+                padding_mask_p1,
+                rewards,
+                actual_start,
+                actual_end,
+                advantages,
+                returns,
+            )
+            empty_cache()
+
+        # HF trainer specifics
+        self.control = self.callback_handler.on_train_end(
+            args, self.state, self.control
+        )
+        if self.control.should_save:
+            self._save_checkpoint(model, trial=None)
+            self.control = self.callback_handler.on_save(
+                self.args, self.state, self.control
+            )
+
+    def generate_completions(self, sampling: bool = False):
+        args = self.args
+        processing_class = self.processing_class
+        generation_config = GenerationConfig(
+            max_new_tokens=self.args.response_length,
+            temperature=(0.01 + 1e-7),
+            top_k=0.0,
+            top_p=1.0,
+            do_sample=True,
+        )
+
+        table = defaultdict(list)
+        with unwrap_model_for_generation(
+            self.model,
+            self.accelerator,
+            gather_deepspeed3_params=self.args.ds3_gather_for_generation,
+        ) as unwrapped_model:
+            for batch in self.eval_dataloader:
+                query = batch["input_ids"]
+                with torch.no_grad():
+                    context_length = query.shape[1]
+                    query_response, _ = batch_generation(
+                        unwrapped_model.policy,
+                        query,
+                        query.shape[0],
+                        processing_class.pad_token_id,
+                        generation_config,
+                    )
+                    response = query_response[:, context_length:]
+                    postprocessed_response = response
+                    if (
+                        self.stop_token_id is not None
+                    ):  # handle the edge case when stop_token_id exists but is 0
+                        postprocessed_response = truncate_response(
+                            self.stop_token_id, processing_class.pad_token_id, response
+                        )
+                    table["query"].extend(
+                        gather_object(
+                            processing_class.batch_decode(
+                                query, skip_special_tokens=True
+                            )
+                        )
+                    )
+                    table["model response"].extend(
+                        gather_object(
+                            processing_class.batch_decode(postprocessed_response)
+                        )
+                    )
+
+                    postprocessed_query_response = torch.cat(
+                        (query, postprocessed_response), 1
+                    )
+                    _, score, _ = get_reward(
+                        self.reward_model,
+                        postprocessed_query_response,
+                        processing_class.pad_token_id,
+                        context_length,
+                    )
+                    table["score"].extend(
+                        self.accelerator.gather_for_metrics(score).float().cpu().numpy()
+                    )
+
+                if sampling:
+                    break
+        df = pd.DataFrame(table)
+
+        if self.accelerator.is_main_process:
+            if is_rich_available():
+                print_rich_table(df.iloc[0 : 0 + 5])
+            if "wandb" in args.report_to:
+                import wandb
+
+                if wandb.run is not None:
+                    wandb.log({"completions": wandb.Table(dataframe=df)})
+
+            if "comet_ml" in args.report_to:
+                log_table_to_comet_experiment(
+                    name="completions.csv",
+                    table=df,
+                )
+
+    # Ensure the model card is saved along with the checkpoint
+    def _save_checkpoint(self, model, trial):
+        if self.args.hub_model_id is None:
+            model_name = Path(self.args.output_dir).name
+        else:
+            model_name = self.args.hub_model_id.split("/")[-1]
+        self.create_model_card(model_name=model_name)
+        super()._save_checkpoint(model, trial)
diff --git a/src/aixpert/training/training/trl/trainer/prm_config.py b/src/aixpert/training/training/trl/trainer/prm_config.py
new file mode 100644
index 0000000..abc9b6f
--- /dev/null
+++ b/src/aixpert/training/training/trl/trainer/prm_config.py
@@ -0,0 +1,119 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass, field
+
+from transformers import TrainingArguments
+
+
+@dataclass
+class PRMConfig(TrainingArguments):
+    r"""
+    Configuration class for the [`PRMTrainer`].
+
+    This class includes only the parameters that are specific to PRM training. For a full list of training arguments,
+    please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this class may
+    differ from those in [`~transformers.TrainingArguments`].
+
+    Using [`~transformers.HfArgumentParser`] we can turn this class into
+    [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
+    command line.
+
+    Parameters
+    ----------
+        max_length (`int` or `None`, *optional*, defaults to `1024`):
+            Maximum length of the sequences (prompt + completion) used for truncation.
+        max_prompt_length (`int` or `None`, *optional*, defaults to `512`):
+            Maximum length of the prompt used for truncation.
+        max_completion_length (`int`, *optional*):
+            Maximum length of the completion used for truncation. The completion is the concatenation of the steps.
+        disable_dropout (`bool`, *optional*, defaults to `True`):
+            Whether to disable dropout in the model.
+        step_separator (`str`, *optional*, defaults to `"\n"`):
+            Separator used to separate each step of the reasoning process.
+        train_on_last_step_only (`bool`, *optional*, defaults to `False`):
+            Whether to train only on the last step.
+        dataset_num_proc (`int`, *optional*):
+            Number of processes to use for processing the dataset.
+    """
+
+    # Parameters whose default values are overridden from TrainingArguments
+    learning_rate: float = field(
+        default=1e-5,
+        metadata={"help": "The initial learning rate for AdamW."},
+    )
+    logging_steps: float = field(
+        default=10,
+        metadata={
+            "help": "Log every X updates steps. Should be an integer or a float in range `[0,1)`. If smaller than 1, "
+            "will be interpreted as ratio of total training steps."
+        },
+    )
+    gradient_checkpointing: bool = field(
+        default=True,
+        metadata={
+            "help": "If True, use gradient checkpointing to save memory at the expense of slower backward pass."
+        },
+    )
+    bf16: bool | None = field(
+        default=None,
+        metadata={
+            "help": "Whether to use bf16 (mixed) precision instead of 32-bit. Requires Ampere or higher NVIDIA "
+            "architecture or Intel XPU or using CPU (use_cpu) or Ascend NPU. If not set, it defaults to `True` if "
+            "`fp16` is not set."
+        },
+    )
+
+    max_length: int | None = field(
+        default=1024,
+        metadata={
+            "help": "Maximum length of the sequences (prompt + completion) used for truncation."
+        },
+    )
+    max_prompt_length: int | None = field(
+        default=512,
+        metadata={"help": "Maximum length of the prompt used for truncation."},
+    )
+    max_completion_length: int | None = field(
+        default=None,
+        metadata={
+            "help": "Maximum length of the completion used for truncation. The completion is the concatenation of the "
+            "steps."
+        },
+    )
+    disable_dropout: bool = field(
+        default=True,
+        metadata={
+            "help": "Whether to disable dropout in the model and reference model."
+        },
+    )
+    step_separator: str = field(
+        default="\n",
+        metadata={
+            "help": "Separator used to separate each step of the reasoning process."
+        },
+    )
+    train_on_last_step_only: bool = field(
+        default=False,
+        metadata={"help": "Whether to train only on the last step."},
+    )
+    dataset_num_proc: int | None = field(
+        default=None,
+        metadata={"help": "Number of processes to use for processing the dataset."},
+    )
+
+    def __post_init__(self):
+        self.bf16 = not (self.fp16) if self.bf16 is None else self.bf16
+
+        super().__post_init__()
diff --git a/src/aixpert/training/training/trl/trainer/prm_trainer.py b/src/aixpert/training/training/trl/trainer/prm_trainer.py
new file mode 100644
index 0000000..cf10dd2
--- /dev/null
+++ b/src/aixpert/training/training/trl/trainer/prm_trainer.py
@@ -0,0 +1,323 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import textwrap
+import warnings
+from collections.abc import Callable
+from itertools import chain
+from pathlib import Path
+
+import torch
+from accelerate import PartialState
+from datasets import Dataset, features
+from torch import nn
+from transformers import (
+    BaseImageProcessor,
+    DataCollator,
+    DataCollatorForTokenClassification,
+    FeatureExtractionMixin,
+    PreTrainedModel,
+    PreTrainedTokenizerBase,
+    ProcessorMixin,
+)
+from transformers.trainer_callback import TrainerCallback
+from transformers.trainer_utils import EvalPrediction
+from transformers.utils import is_peft_available
+
+from ..models import prepare_peft_model
+from .base_trainer import BaseTrainer
+from .prm_config import PRMConfig
+from .utils import compute_accuracy, disable_dropout_in_model
+
+
+if is_peft_available():
+    from peft import PeftModel
+
+
+class PRMTrainer(BaseTrainer):
+    """
+    Initialize PRMTrainer.
+
+    Args:
+        model ([`~transformers.PreTrainedModel`]):
+            The model to train, preferably an `AutoModelForTokenClassification`.
+        args ([`PRMConfig`]):
+            The arguments to use for training.
+        data_collator ([`~transformers.DataCollator`]):
+            The data collator to use for training. If None is specified, the default data collator
+            ([`~transformers.DataCollatorForTokenClassification`]) will be used which will pad the sequences to the
+            maximum length of the sequences in the batch, given a dataset of paired sequences.
+        train_dataset ([`~datasets.Dataset`]):
+            The dataset to use for training.
+        eval_dataset ([`~datasets.Dataset`]):
+            The dataset to use for evaluation.
+        processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*):
+            Processing class used to process the data. If provided, will be used to automatically process the inputs
+            for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
+            reuse the fine-tuned model.
+        model_init (`Callable[[], transformers.PreTrainedModel]`):
+            The model initializer to use for training. If None is specified, the default model initializer will be
+            used.
+        compute_metrics (`Callable[[transformers.EvalPrediction], dict]`, *optional* defaults to `compute_accuracy`):
+            The metrics to use for evaluation. If no metrics are specified, the default metric (`compute_accuracy`)
+            will be used.
+        callbacks (`list[transformers.TrainerCallback]`):
+            The callbacks to use for training.
+        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
+            The optimizer and scheduler to use for training.
+        preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
+            The function to use to preprocess the logits before computing the metrics.
+        peft_config (`dict`, defaults to `None`):
+            The PEFT configuration to use for training. If you pass a PEFT configuration, the model will be wrapped in
+            a PEFT model.
+    """
+
+    _tag_names = ["trl", "prm"]
+    _name = "PRM"
+    _paper = {
+        "title": "Solving math word problems with process-and outcome-based feedback",
+        "id": "2211.14275",
+        # docstyle-ignore
+        "citation": textwrap.dedent("""\
+            @article{uesato2022solving,
+                title        = {{Solving Math Word Problems With Process- and Outcome-Based Feedback}},
+                author       = {Uesato, Jonathan and Kushman, Nate and Kumar, Ramana and Song, Francis and Siegel, Noah and Wang, Lisa and Creswell, Antonia and Irving, Geoffrey and Higgins, Irina},
+                year         = 2022,
+                journal      = {arXiv preprint arXiv:2211.14275}
+            }"""),
+    }
+
+    def __init__(
+        self,
+        model: PreTrainedModel | nn.Module | None = None,
+        args: PRMConfig | None = None,
+        data_collator: DataCollator | None = None,
+        train_dataset: Dataset | None = None,
+        eval_dataset: Dataset | dict[str, Dataset] | None = None,
+        processing_class: PreTrainedTokenizerBase
+        | BaseImageProcessor
+        | FeatureExtractionMixin
+        | ProcessorMixin
+        | None = None,
+        model_init: Callable[[], PreTrainedModel] | None = None,
+        compute_metrics: Callable[[EvalPrediction], dict] | None = None,
+        callbacks: list[TrainerCallback] | None = None,
+        optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (
+            None,
+            None,
+        ),
+        preprocess_logits_for_metrics: Callable[
+            [torch.Tensor, torch.Tensor], torch.Tensor
+        ]
+        | None = None,
+        peft_config: dict | None = None,
+    ):
+        if not os.environ.get("TRL_EXPERIMENTAL_SILENCE"):
+            warnings.warn(
+                "This trainer will soon be moved to trl.experimental and is a candidate for removal. If you rely on "
+                "it and want it to remain, please share your comments here: "
+                "https://github.com/huggingface/trl/issues/4223. Silence this warning by setting environment variable "
+                "TRL_EXPERIMENTAL_SILENCE=1."
+            )
+        if peft_config is not None or (
+            is_peft_available() and isinstance(model, PeftModel)
+        ):
+            model = prepare_peft_model(model, peft_config, args)
+
+        # Disable dropout in the model
+        if args.disable_dropout:
+            disable_dropout_in_model(model)
+
+        if compute_metrics is None:
+            compute_metrics = compute_accuracy
+
+        if data_collator is None:
+            if processing_class is None:
+                raise ValueError(
+                    "A processing_class must be specified when using the default DataCollatorForTokenClassification"
+                )
+            data_collator = DataCollatorForTokenClassification(processing_class)
+
+        if "input_ids" not in train_dataset.column_names:
+            with PartialState().main_process_first():
+                fn_kwargs = {
+                    "tokenizer": processing_class,
+                    "step_separator": args.step_separator,
+                    "max_length": args.max_length,
+                    "max_prompt_length": args.max_prompt_length,
+                    "max_completion_length": args.max_completion_length,
+                    "train_on_last_step_only": args.train_on_last_step_only,
+                }
+                train_fn_kwargs = {**fn_kwargs, "is_eval": False}
+                train_dataset = train_dataset.map(
+                    self.tokenize_row,
+                    fn_kwargs=train_fn_kwargs,
+                    num_proc=args.dataset_num_proc,
+                    remove_columns=train_dataset.features,
+                    desc="Tokenizing train dataset",
+                    features=features.Features(  # needed to avoid map to cast labels to bool
+                        {
+                            "labels": features.Sequence(features.Value("int64")),
+                            "input_ids": features.Sequence(features.Value("int64")),
+                        }
+                    ),
+                )
+
+                eval_fn_kwargs = {**fn_kwargs, "is_eval": True}
+                if eval_dataset is not None:
+                    eval_dataset = eval_dataset.map(
+                        self.tokenize_row,
+                        fn_kwargs=eval_fn_kwargs,
+                        num_proc=args.dataset_num_proc,
+                        remove_columns=eval_dataset.features,
+                        desc="Tokenizing eval dataset",
+                        features=features.Features(  # needed to avoid map to cast labels to bool
+                            {
+                                "labels": features.Sequence(features.Value("int64")),
+                                "input_ids": features.Sequence(features.Value("int64")),
+                            }
+                        ),
+                    )
+
+        super().__init__(
+            model=model,
+            args=args,
+            data_collator=data_collator,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            processing_class=processing_class,
+            model_init=model_init,
+            compute_metrics=compute_metrics,
+            callbacks=callbacks,
+            optimizers=optimizers,
+            preprocess_logits_for_metrics=preprocess_logits_for_metrics,
+        )
+
+        # Add tags for models that have been loaded with the correct transformers version
+        if hasattr(self.model, "add_model_tags"):
+            self.model.add_model_tags(self._tag_names)
+
+    @staticmethod
+    def tokenize_row(
+        features,
+        tokenizer,
+        step_separator,
+        max_length,
+        max_prompt_length,
+        max_completion_length,
+        train_on_last_step_only,
+        is_eval,
+    ):
+        r"""
+        Tokenize a row of the dataset.
+
+        Args:
+            features (`dict[str, str]`):
+                Row of the dataset, should contain the keys `"prompt"`, `"completions"`, and `"labels"`.
+            tokenizer ([`~transformers.PreTrainedTokenizerBase`]):
+                Tokenizer used to process the data.
+            step_separator (`str`):
+                Separator between steps in the completion.
+            max_length (`int` or `None`):
+               Maximum length of the sequences (prompt + completion). If `None`, the sequences are not truncated.
+            max_prompt_length (`int` or `None`):
+                Maximum length of the prompt. If `None`, the prompt is not truncated.
+            max_completion_length (`int` or `None`):
+                Maximum length of the completion sequences. If `None`, the completion sequences are not truncated.
+            train_on_last_step_only (`bool`):
+                Whether to train only on the last step. If `True`, the labels are `-100` for all tokens except the last
+                token of the completion.
+            is_eval (`bool`):
+                Whether the function is used to tokenize samples from a training or an evaluation dataset. Used only if
+                `train_on_last_step_only` is set to `True`.
+
+        Returns
+        -------
+            `dict[str, list[int]]`:
+                Tokenized sequences with the keys `"input_ids"`, and `"labels".
+
+        Example:
+        ```python
+        >>> from transformers import AutoTokenizer
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B")
+        >>> features = {
+        ...     "prompt": "Which number is larger, 9.8 or 9.11?",
+        ...     "completions": ["11 is greater than 8.", "Hence, 9.11 > 9.8."],
+        ...     "labels": [True, False],
+        ... }
+        >>> PRMTrainer.tokenize_row(
+        ...     features, tokenizer, "\n", max_completion_length=None, train_on_last_step_only=False, is_eval=False
+        ... )
+        {'input_ids': [23085, 1372, 374, 8131, 11, 220, 24, 13, 23, 476, 220, 24, 13, 16, 16, 30, 16, 16, 374, 7046, 1091, 220, 23, 13, 198, 39, 763, 11, 220, 24, 13, 16, 16, 861, 220, 24, 13, 23, 13, 198],
+         'labels': [-100, -100, -100, -100, -100, -100, -100, -100, 1, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 0]}
+        ```
+        """
+        # Tokenize the prompt and completions
+        prompt_ids = tokenizer(features["prompt"], add_special_tokens=False)[
+            "input_ids"
+        ]
+        completions_ids = [
+            tokenizer(completion, add_special_tokens=False)["input_ids"]
+            for completion in features["completions"]
+        ]
+        if train_on_last_step_only and not is_eval:
+            labels = [-100] * (len(features["labels"]) - 1) + [
+                int(features["labels"][-1])
+            ]
+        else:
+            labels = [int(label) for label in features["labels"]]
+
+        # Get the ID of the separator token and add it to the completions
+        separator_ids = tokenizer.encode(step_separator, add_special_tokens=False)
+        completions_ids = [completion + separator_ids for completion in completions_ids]
+
+        # Create the label
+        labels = [
+            [-100] * (len(completion) - 1) + [label]
+            for completion, label in zip(completions_ids, labels, strict=True)
+        ]
+
+        # Join the completions and labels steps
+        completion_ids = list(chain(*completions_ids))
+        labels = list(chain(*labels))
+
+        if tokenizer.bos_token_id is not None:
+            prompt_ids = [tokenizer.bos_token_id] + prompt_ids
+
+        # Truncate prompt and completion sequences
+        if max_prompt_length is not None:
+            prompt_ids = prompt_ids[-max_prompt_length:]
+        if max_completion_length is not None:
+            completion_ids = completion_ids[:max_completion_length]
+            labels = labels[:max_completion_length]
+
+        input_ids = prompt_ids + completion_ids
+        labels = [-100] * len(prompt_ids) + labels
+
+        if max_length is not None:
+            input_ids = input_ids[:max_length]
+            labels = labels[:max_length]
+
+        return {"input_ids": input_ids, "labels": labels}
+
+    # Ensure the model card is saved along with the checkpoint
+    def _save_checkpoint(self, model, trial):
+        if self.args.hub_model_id is None:
+            model_name = Path(self.args.output_dir).name
+        else:
+            model_name = self.args.hub_model_id.split("/")[-1]
+        self.create_model_card(model_name=model_name)
+        super()._save_checkpoint(model, trial)
diff --git a/src/aixpert/training/training/trl/trainer/reward_config.py b/src/aixpert/training/training/trl/trainer/reward_config.py
new file mode 100644
index 0000000..9b3a1f8
--- /dev/null
+++ b/src/aixpert/training/training/trl/trainer/reward_config.py
@@ -0,0 +1,174 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass, field
+from typing import Any
+
+from transformers import TrainingArguments
+
+
+@dataclass
+class RewardConfig(TrainingArguments):
+    r"""
+    Configuration class for the [`RewardTrainer`].
+
+    This class includes only the parameters that are specific to Reward training. For a full list of training
+    arguments, please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this
+    class may differ from those in [`~transformers.TrainingArguments`].
+
+    Using [`~transformers.HfArgumentParser`] we can turn this class into
+    [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
+    command line.
+
+    Parameters
+    ----------
+        > Parameters that control the model
+
+        model_init_kwargs (`dict[str, Any]`, *optional*):
+            Keyword arguments for [`~transformers.AutoModelForCausalLM.from_pretrained`], used when the `model`
+            argument of the [`RewardTrainer`] is provided as a string. If you're training a MoE architecture and want
+            to include the load balancing/auxilliary loss as a part of the final loss, remember to set
+            `output_router_logits=True` in this dictionary.
+        chat_template_path (`str`, *optional*):
+            If specified, sets the model's chat template. This can either be the path to a tokenizer (local directory
+            or Hugging Face Hub model) or a direct path to a Jinja template file. When using a Jinja file, you must
+            ensure that any special tokens referenced in the template are added to the tokenizer and that the model's
+            embedding layer is resized accordingly.
+        disable_dropout (`bool`, *optional*, defaults to `True`):
+            Whether to disable dropout in the model.
+
+        > Parameters that control the data preprocessing
+
+        dataset_num_proc (`int`, *optional*):
+            Number of processes to use for processing the dataset.
+        eos_token (`str`, *optional*):
+            Token used to indicate the end of a turn or sequence. If `None`, it defaults to
+            `processing_class.eos_token`.
+        pad_token (`str`, *optional*):
+            Token used for padding. If `None`, it defaults to `processing_class.pad_token`, or if that is also `None`,
+            it falls back to `processing_class.eos_token`.
+        max_length (`int` or `None`, *optional*, defaults to `1024`):
+            Maximum length of the tokenized sequence. Samples are filtered out if either chosen or rejected sequence
+            exceeds this value. If `None`, no filtering is applied.
+        pad_to_multiple_of (`int`, *optional*):
+            If set, the sequences will be padded to a multiple of this value.
+
+        > Parameters that control the training
+
+        center_rewards_coefficient (`float`, *optional*):
+            Coefficient to incentivize the reward model to output mean-zero rewards (proposed by
+            https://huggingface.co/papers/2312.09244, Eq. 2). Recommended value: `0.01`.
+        activation_offloading (`bool`, *optional*, defaults to `False`):
+            Whether to offload the activations to the CPU.
+    """
+
+    _VALID_DICT_FIELDS = TrainingArguments._VALID_DICT_FIELDS + ["model_init_kwargs"]
+
+    # Parameters whose default values are overridden from TrainingArguments
+    learning_rate: float = field(
+        default=1e-4,
+        metadata={"help": "The initial learning rate for AdamW."},
+    )
+    logging_steps: float = field(
+        default=10,
+        metadata={
+            "help": "Log every X updates steps. Should be an integer or a float in range `[0,1)`. If smaller than 1, "
+            "will be interpreted as ratio of total training steps."
+        },
+    )
+    gradient_checkpointing: bool = field(
+        default=True,
+        metadata={
+            "help": "If True, use gradient checkpointing to save memory at the expense of slower backward pass."
+        },
+    )
+    bf16: bool | None = field(
+        default=None,
+        metadata={
+            "help": "Whether to use bf16 (mixed) precision instead of 32-bit. Requires Ampere or higher NVIDIA "
+            "architecture or Intel XPU or using CPU (use_cpu) or Ascend NPU. If not set, it defaults to `True` if "
+            "`fp16` is not set."
+        },
+    )
+
+    # Parameters that control the model
+    model_init_kwargs: dict[str, Any] | None = field(
+        default=None,
+        metadata={
+            "help": "Keyword arguments for `AutoModelForCausalLM.from_pretrained`, used when the `model` argument of "
+            "the `RewardTrainer` is provided as a string."
+        },
+    )
+    chat_template_path: str | None = field(
+        default=None,
+        metadata={
+            "help": "If specified, sets the model's chat template. This can either be the path to a tokenizer (local "
+            "directory or Hugging Face Hub model) or a direct path to a Jinja template file. When using a Jinja file, "
+            "you must ensure that any special tokens referenced in the template are added to the tokenizer and "
+            "that the model's embedding layer is resized accordingly."
+        },
+    )
+    disable_dropout: bool = field(
+        default=True,
+        metadata={"help": "Whether to disable dropout in the model."},
+    )
+
+    # Parameters that control the data preprocessing
+    dataset_num_proc: int | None = field(
+        default=None,
+        metadata={"help": "Number of processes to use for processing the dataset."},
+    )
+    eos_token: str | None = field(
+        default=None,
+        metadata={
+            "help": "Token used to indicate the end of a turn or sequence. If `None`, it defaults to `processing_class.eos_token`."
+        },
+    )
+    pad_token: str | None = field(
+        default=None,
+        metadata={
+            "help": "Token used for padding. If `None`, it defaults to `processing_class.pad_token`, or if that "
+            "is also `None`, it falls back to `processing_class.eos_token`."
+        },
+    )
+    max_length: int | None = field(
+        default=1024,
+        metadata={
+            "help": "Maximum length of the tokenized sequence. Sequences longer than `max_length` are truncated from"
+            "the right. If `None`, no truncation is applied."
+        },
+    )
+    pad_to_multiple_of: int | None = field(
+        default=None,
+        metadata={
+            "help": "If set, the sequences will be padded to a multiple of this value."
+        },
+    )
+
+    # Parameters that control the training
+    center_rewards_coefficient: float | None = field(
+        default=None,
+        metadata={
+            "help": "Coefficient to incentivize the reward model to output mean-zero rewards (proposed by "
+            "https://huggingface.co/papers/2312.09244, Eq. 2). Recommended value: `0.01`."
+        },
+    )
+    activation_offloading: bool = field(
+        default=False,
+        metadata={"help": "Whether to offload the activations to the CPU."},
+    )
+
+    def __post_init__(self):
+        self.bf16 = not (self.fp16) if self.bf16 is None else self.bf16
+        super().__post_init__()
diff --git a/src/aixpert/training/training/trl/trainer/reward_trainer.py b/src/aixpert/training/training/trl/trainer/reward_trainer.py
new file mode 100644
index 0000000..05acfa8
--- /dev/null
+++ b/src/aixpert/training/training/trl/trainer/reward_trainer.py
@@ -0,0 +1,693 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import contextlib
+import logging
+import os
+import re
+from collections import defaultdict
+from collections.abc import Callable
+from contextlib import contextmanager
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+import torch
+import transformers
+from accelerate import PartialState
+from accelerate.logging import get_logger
+from datasets import Dataset, IterableDataset
+from torch import nn
+from transformers import (
+    AutoModelForSequenceClassification,
+    AutoTokenizer,
+    DataCollator,
+    PreTrainedModel,
+    PreTrainedTokenizerBase,
+)
+from transformers.data.data_collator import DataCollatorMixin
+from transformers.trainer_callback import TrainerCallback
+from transformers.trainer_utils import EvalPrediction
+from transformers.utils import is_peft_available
+
+from ..data_utils import is_conversational
+from ..models import (
+    clone_chat_template,
+    get_act_offloading_ctx_manager,
+    prepare_peft_model,
+)
+from .base_trainer import BaseTrainer
+from .reward_config import RewardConfig
+from .utils import (
+    disable_dropout_in_model,
+    get_config_model_id,
+    pad,
+    remove_none_values,
+)
+
+
+if is_peft_available():
+    from peft import PeftConfig, PeftModel
+
+
+logger = get_logger(__name__)
+
+
+# AutoModelForSequenceClassification adds a new classification head when loading a CausalLM. That head is randomly
+# initialized and triggers a harmless warning about uninitialized weights. We suppress just that specific warning to
+# avoid confusing users.
+@contextmanager
+def suppress_from_pretrained_warning(logger: logging.Logger):
+    pattern = re.compile(
+        r"^Some weights of \S+ were not initialized from the model checkpoint at \S+ and are newly initialized: "
+        r"\[.*\]\nYou should probably TRAIN this model on a down-stream task to be able to use it for predictions and "
+        r"inference\.$"
+    )
+
+    class _Filter(logging.Filter):
+        def filter(self, record: logging.LogRecord) -> bool:
+            return not pattern.search(record.getMessage())
+
+    f = _Filter()
+    logger.addFilter(f)
+    try:
+        yield
+    finally:
+        logger.removeFilter(f)
+
+
+@dataclass
+class DataCollatorForPreference(DataCollatorMixin):
+    """
+    Data collator used for preference data. Inputs are dynamically padded to the maximum length of a batch.
+
+    This collator expects each example in the input list to be a dictionary containing the `"chosen_input_ids"` and
+    `"rejected_input_ids"` keys. The collator returns a dictionary containing the following keys:
+    - `"input_ids"`: Tensor of input IDs, padded to the maximum length of the batch. The first half of the batch
+        corresponds to the `"chosen_input_ids"` and the second half to the `"rejected_input_ids"`.
+    - `"attention_mask"`: Tensor of attention mask, padded to the maximum length of the batch.
+
+    Optionally, the examples can contain a `"margin"` key, in which case the returned dictionary will also contain a
+    `"margin"` key with a tensor of margins.
+
+    Args:
+        pad_token_id (`int`):
+            Token ID to use for padding.
+        pad_to_multiple_of (`int`, *optional*):
+            If set, the sequences will be padded to a multiple of this value.
+        return_tensors (`str`, *optional*, defaults to `"pt"`):
+            Type of Tensor to return. Only `"pt"` is currently supported.
+
+    Examples
+    --------
+    ```python
+    >>> from trl.trainer.reward_trainer import DataCollatorForPreference
+
+    >>> collator = DataCollatorForPreference(pad_token_id=0)
+    >>> examples = [
+    ...     {"chosen_input_ids": [1, 2, 3], "rejected_input_ids": [4, 5]},
+    ...     {"chosen_input_ids": [6, 7], "rejected_input_ids": [8]},
+    ... ]
+    >>> collator(examples)
+    {'input_ids': tensor([[1, 2, 3],
+                          [6, 7, 0],
+                          [4, 5, 0],
+                          [8, 0, 0]]),
+     'attention_mask': tensor([[1, 1, 1],
+                               [1, 1, 0],
+                               [1, 1, 0],
+                               [1, 0, 0]])}
+
+    >>> examples = [
+    ...     {"chosen_input_ids": [1, 2, 3], "rejected_input_ids": [4, 5], "margin": 0.5},
+    ...     {"chosen_input_ids": [6, 7], "rejected_input_ids": [8], "margin": 0.0},
+    ... ]
+    >>> collator(examples)
+    {'input_ids': tensor([[1, 2, 3],
+                          [6, 7, 0],
+                          [4, 5, 0],
+                          [8, 0, 0]]),
+     'attention_mask': tensor([[1, 1, 1],
+                               [1, 1, 0],
+                               [1, 1, 0],
+                               [1, 0, 0]]),
+     'margin': tensor([0.5, 0.0])}
+    ```
+    """
+
+    pad_token_id: int
+    pad_to_multiple_of: int | None = None
+    return_tensors: str = "pt"
+
+    def torch_call(self, examples: list[dict[str, Any]]) -> dict[str, Any]:
+        # Convert to tensor
+        chosen_input_ids = [
+            torch.tensor(example["chosen_input_ids"]) for example in examples
+        ]
+        rejected_input_ids = [
+            torch.tensor(example["rejected_input_ids"]) for example in examples
+        ]
+        if "margin" in examples[0]:
+            margins = torch.tensor(
+                [example["margin"] for example in examples], dtype=torch.float
+            )
+        input_ids = chosen_input_ids + rejected_input_ids
+        attention_mask = [torch.ones_like(ids) for ids in input_ids]
+
+        output = {}
+
+        # Pad
+        output["input_ids"] = pad(
+            input_ids,
+            padding_value=self.pad_token_id,
+            padding_side="right",
+            pad_to_multiple_of=self.pad_to_multiple_of,
+        )
+        output["attention_mask"] = pad(
+            attention_mask,
+            padding_value=0,
+            padding_side="right",
+            pad_to_multiple_of=self.pad_to_multiple_of,
+        )
+        if "margin" in examples[0]:
+            output["margin"] = margins
+        return output
+
+
+class RewardTrainer(BaseTrainer):
+    """
+    Trainer for Outcome-supervised Reward Models (ORM).
+
+    This class is a wrapper around the [`~transformers.Trainer`] class and inherits all of its attributes and methods.
+
+    Example:
+
+    ```python
+    from trl import RewardTrainer
+    from datasets import load_dataset
+
+    dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")
+
+    trainer = RewardTrainer(model="Qwen/Qwen2.5-0.5B-Instruct", train_dataset=dataset)
+    trainer.train()
+    ```
+
+    Args:
+        model (`str | PreTrainedModel`):
+            Model to be trained. Can be either:
+
+            - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or a
+              path to a *directory* containing model weights saved using
+              [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded
+              using `AutoModelForSequenceClassification.from_pretrained` with the keyword arguments in
+              `args.model_init_kwargs`.
+            - A sequence classification [`~transformers.PreTrainedModel`] object.
+        args ([`RewardConfig`], *optional*):
+            Configuration for this trainer. If `None`, a default configuration is used.
+        data_collator ([`~transformers.DataCollator`], *optional*):
+            Function to use to form a batch from a list of elements of the processed `train_dataset` or `eval_dataset`.
+            Will default to [`~trainer.reward_trainer.DataCollatorForPreference`].
+        train_dataset ([`~datasets.Dataset`] or [`~datasets.IterableDataset`]):
+            Dataset to use for training. This trainer supports [preference](#preference) type (both implicit and
+            explicit prompt). The format of the samples can be either:
+
+            - [Standard](dataset_formats#standard): Each sample contains plain text.
+            - [Conversational](dataset_formats#conversational): Each sample contains structured messages (e.g., role
+              and content).
+
+            The trainer also supports processed datasets (tokenized) as long as they contain an `chosen_input_ids` and
+            `rejected_input_ids` fields.
+        eval_dataset ([`~datasets.Dataset`], [`~datasets.IterableDataset`] or `dict[str, Dataset | IterableDataset]`):
+            Dataset to use for evaluation. It must meet the same requirements as `train_dataset`.
+        processing_class ([`~transformers.PreTrainedTokenizerBase`], *optional*):
+            Tokenizer used to process the data. If `None`, the tokenizer is loaded from the model's name with
+            [`~transformers.AutoTokenizer.from_pretrained`]. A padding token, `processing_class.pad_token`, must be
+            set. If the processing class has not set a padding token, `processing_class.eos_token` will be used as the
+            default.
+        compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
+            The function that will be used to compute metrics at evaluation. Must take a
+            [`~transformers.EvalPrediction`] and return a dictionary string to metric values. When passing
+            [`RewardConfig`] with `batch_eval_metrics` set to `True`, your `compute_metrics` function must take a
+            boolean `compute_result` argument. This will be triggered after the last eval batch to signal that the
+            function needs to calculate and return the global summary statistics rather than accumulating the
+            batch-level statistics.
+        callbacks (list of [`~transformers.TrainerCallback`], *optional*):
+            List of callbacks to customize the training loop. Will add those to the list of default callbacks detailed
+            in [here](https://huggingface.co/docs/transformers/main_classes/callback).
+
+            If you want to remove one of the default callbacks used, use the [`~transformers.Trainer.remove_callback`]
+            method.
+        optimizers (`tuple[torch.optim.Optimizer | None, torch.optim.lr_scheduler.LambdaLR | None]`, *optional*, defaults to `(None, None)`):
+            A tuple containing the optimizer and the scheduler to use. Will default to an instance of `AdamW` on your
+            model and a scheduler given by [`~transformers.get_linear_schedule_with_warmup`] controlled by `args`.
+        optimizer_cls_and_kwargs (`tuple[Type[torch.optim.Optimizer], Dict[str, Any]]`, *optional*):
+            A tuple containing the optimizer class and keyword arguments to use. Overrides `optim` and `optim_args` in
+            `args`. Incompatible with the `optimizers` argument.
+
+            Unlike `optimizers`, this argument avoids the need to place model parameters on the correct devices before
+            initializing the Trainer.
+        preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`, *optional*):
+            A function that preprocess the logits right before caching them at each evaluation step. Must take two
+            tensors, the logits and the labels, and return the logits once processed as desired. The modifications made
+            by this function will be reflected in the predictions received by `compute_metrics`.
+
+            Note that the labels (second parameter) will be `None` if the dataset does not have them.
+        peft_config ([`~peft.PeftConfig`], *optional*):
+            PEFT configuration used to wrap the model. If `None`, the model is not wrapped. Note that if the loaded
+            model is a causal LM, it's highly recommended to set `modules_to_save=["score"]` in the PEFT configuration
+            to ensure that the reward head is properly trained.
+    """
+
+    _tag_names = ["trl", "reward-trainer"]
+    _name = "Reward"
+    _template_file = "rm_model_card.md"
+
+    def __init__(
+        self,
+        model: str | PreTrainedModel,
+        args: RewardConfig | None = None,
+        data_collator: DataCollator | None = None,
+        train_dataset: Dataset | IterableDataset | None = None,
+        eval_dataset: Dataset | dict[str, Dataset] | None = None,
+        processing_class: PreTrainedTokenizerBase | None = None,
+        compute_metrics: Callable[[EvalPrediction], dict] | None = None,
+        callbacks: list[TrainerCallback] | None = None,
+        optimizers: tuple[
+            torch.optim.Optimizer | None, torch.optim.lr_scheduler.LambdaLR | None
+        ] = (None, None),
+        optimizer_cls_and_kwargs: tuple[type[torch.optim.Optimizer], dict[str, Any]]
+        | None = None,
+        preprocess_logits_for_metrics: Callable[
+            [torch.Tensor, torch.Tensor], torch.Tensor
+        ]
+        | None = None,
+        peft_config: "PeftConfig | None" = None,
+    ):
+        # Args
+        if args is None:
+            model_name = (
+                model if isinstance(model, str) else get_config_model_id(model.config)
+            )
+            model_name = model_name.split("/")[-1]
+            args = RewardConfig(f"{model_name}-Reward")
+
+        # Model
+        model_init_kwargs = args.model_init_kwargs or {}
+        if isinstance(model, str):
+            model_id = model
+            dtype = model_init_kwargs.get("dtype")
+            if isinstance(dtype, torch.dtype) or dtype == "auto" or dtype is None:
+                pass  # dtype is already a torch.dtype or "auto" or None
+            elif isinstance(dtype, str) and dtype in ["bfloat16", "float16", "float32"]:
+                model_init_kwargs["dtype"] = getattr(torch, dtype)
+            else:
+                raise ValueError(
+                    "Invalid `dtype` passed to `RewardConfig`. Expected either 'auto' or a string representing "
+                    f"a valid `torch.dtype` (e.g., 'float32'), but got {dtype}."
+                )
+            with suppress_from_pretrained_warning(transformers.modeling_utils.logger):
+                model = AutoModelForSequenceClassification.from_pretrained(
+                    model_id, num_labels=1, **model_init_kwargs
+                )
+        else:
+            model_id = get_config_model_id(model.config)
+            if args.model_init_kwargs is not None:
+                logger.warning(
+                    "You passed `model_init_kwargs` to the `RewardConfig`, but your model is already instantiated. "
+                    "The `model_init_kwargs` will be ignored."
+                )
+
+        # Processing class
+        if processing_class is None:
+            processing_class = AutoTokenizer.from_pretrained(model_id)
+
+        # Handle pad token for processors or tokenizers
+        if args.eos_token is not None:
+            eos_token = args.eos_token
+            eos_token_id = processing_class.convert_tokens_to_ids(eos_token)
+            if eos_token_id is None:
+                raise ValueError(
+                    f"The specified `eos_token` ('{eos_token}') is not found in the vocabulary of the given "
+                    f"`processing_class` ({processing_class.__class__.__name__}). Ensure that the `eos_token` exists "
+                    "in the vocabulary before using it as an EOS token."
+                )
+            processing_class.eos_token_id = eos_token_id
+
+        if args.chat_template_path is not None:
+            if os.path.isfile(
+                args.chat_template_path
+            ) and args.chat_template_path.endswith((".jinja", ".j2")):
+                with open(
+                    args.chat_template_path, encoding="utf-8"
+                ) as chat_template_file:
+                    processing_class.chat_template = chat_template_file.read()
+                added_tokens = []
+            else:
+                model, processing_class, added_tokens = clone_chat_template(
+                    model, processing_class, args.chat_template_path
+                )
+        else:
+            added_tokens = []
+
+        # PEFT configuration and model wrapping
+        if peft_config is not None:
+            if added_tokens:
+                # Ensure that the added tokens are trainable
+                if peft_config.trainable_token_indices is None:
+                    peft_config.trainable_token_indices = {"embed_tokens": added_tokens}
+                elif "embed_tokens" not in peft_config.trainable_token_indices:
+                    peft_config.trainable_token_indices["embed_tokens"] = added_tokens
+                else:
+                    peft_config.trainable_token_indices["embed_tokens"].extend(
+                        added_tokens
+                    )
+
+                # Ensure that the lm_head is trainable
+                if (
+                    peft_config.modules_to_save is None
+                    or "lm_head" not in peft_config.modules_to_save
+                ):
+                    logger.warning(
+                        "Cloning chat template added new tokens to the tokenizer, but 'lm_head' is not in PEFT's "
+                        "`modules_to_save`. As a result, the model may not learn to generate outputs with these new "
+                        "tokens, leading to degraded generation quality. To fix this, add "
+                        "`modules_to_save=['lm_head']` to your PEFT configuration."
+                    )
+
+                    if peft_config.modules_to_save is None:
+                        peft_config.modules_to_save = ["lm_head"]
+                    else:
+                        peft_config.modules_to_save.append("lm_head")
+
+        if peft_config is not None or (
+            is_peft_available() and isinstance(model, PeftModel)
+        ):
+            model = prepare_peft_model(model, peft_config, args)
+
+        # Disable dropout in the model
+        if args.disable_dropout:
+            disable_dropout_in_model(model)
+
+        # Pad token (needed for SequenceClassification models)
+        # If not provided, use the one from the processing class or the eos token if the processing class does not have
+        # a pad token.
+        pad_token = (
+            args.pad_token or processing_class.pad_token or processing_class.eos_token
+        )
+        pad_token_id = processing_class.convert_tokens_to_ids(pad_token)
+        if pad_token_id is None:
+            raise ValueError(
+                f"The specified `pad_token` ('{pad_token}') is not found in the vocabulary of the given "
+                f"`processing_class` ({processing_class.__class__.__name__}). Ensure that the `pad_token` exists "
+                "in the vocabulary before using it as a padding token."
+            )
+        model.config.pad_token_id = pad_token_id
+        processing_class.pad_token_id = pad_token_id
+
+        # Data collator
+        if data_collator is None:
+            data_collator = DataCollatorForPreference(
+                pad_token_id=pad_token_id,
+                pad_to_multiple_of=args.pad_to_multiple_of,
+            )
+
+        # Dataset
+        train_dataset = self._prepare_dataset(
+            train_dataset, processing_class, args, "train"
+        )
+        if eval_dataset is not None:
+            if isinstance(eval_dataset, dict):
+                eval_dataset = {
+                    key: self._prepare_dataset(dataset, processing_class, args, key)
+                    for key, dataset in eval_dataset.items()
+                }
+            else:
+                eval_dataset = self._prepare_dataset(
+                    eval_dataset, processing_class, args, "eval"
+                )
+
+        # Initialize the metrics
+        self._metrics = {"train": defaultdict(list), "eval": defaultdict(list)}
+        self._total_train_tokens = 0
+
+        # Initialize the Trainer. Parent class will handle:
+        # - DeepSpeed configuration (through create_accelerator_and_postprocess)
+        # - FSDP setup
+        # - Distributed training setup
+        # - Optimizer and scheduler creation
+
+        super().__init__(
+            model=model,
+            args=args,
+            data_collator=data_collator,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            processing_class=processing_class,
+            compute_metrics=compute_metrics,
+            callbacks=callbacks,
+            optimizers=optimizers,
+            optimizer_cls_and_kwargs=optimizer_cls_and_kwargs,
+            preprocess_logits_for_metrics=preprocess_logits_for_metrics,
+        )
+
+        # During evaluation, Trainer calls compute_loss() only if can_return_loss is True and label_names is empty.
+        self.can_return_loss = True
+        self.label_names = []
+
+        # Initialize activation offloading context
+        if self.args.activation_offloading:
+            self.maybe_activation_offload_context = get_act_offloading_ctx_manager(
+                model=self.model
+            )
+        else:
+            self.maybe_activation_offload_context = contextlib.nullcontext()
+
+        # Add tags for models that have been loaded with the correct transformers version
+        if hasattr(self.model, "add_model_tags"):
+            self.model.add_model_tags(self._tag_names)
+
+        self.aux_loss_enabled = getattr(model.config, "output_router_logits", False)
+
+    def _prepare_dataset(
+        self,
+        dataset: Dataset | IterableDataset,
+        processing_class: PreTrainedTokenizerBase,
+        args: RewardConfig,
+        dataset_name: str,
+    ) -> Dataset | IterableDataset:
+        # Tabular backends like Arrow/Parquet insert `None` for mismatched keys in nested structures. Clean them from
+        # sampled data.
+        if isinstance(
+            dataset, Dataset
+        ):  # IterableDataset does not support `with_transform`
+            dataset = dataset.with_transform(remove_none_values)
+
+        # If the dataset is already preprocessed (tokenized), skip the processing steps.
+        column_names = list(next(iter(dataset)).keys())
+        is_processed = (
+            "chosen_input_ids" in column_names and "rejected_input_ids" in column_names
+        )
+
+        # Build the kwargs for the `map` function
+        map_kwargs = {}
+        if isinstance(dataset, Dataset):  # IterableDataset does not support num_proc
+            map_kwargs["num_proc"] = args.dataset_num_proc
+
+        with PartialState().main_process_first():
+            if not is_processed:
+                # Add EOS token to the end of the sequences if needed
+                first_example = next(iter(dataset))
+                if not is_conversational(first_example):
+                    if isinstance(
+                        dataset, Dataset
+                    ):  # `IterableDataset.map` does not support `desc`
+                        map_kwargs["desc"] = f"Adding EOS to {dataset_name} dataset"
+
+                    def add_eos(example, eos_token):
+                        if not example["chosen"].endswith(eos_token):
+                            example["chosen"] = example["chosen"] + eos_token
+                        if "rejected" in example and not example["rejected"].endswith(
+                            eos_token
+                        ):
+                            example["rejected"] = example["rejected"] + eos_token
+                        return example
+
+                    dataset = dataset.map(
+                        add_eos,
+                        fn_kwargs={"eos_token": processing_class.eos_token},
+                        **map_kwargs,
+                    )
+
+                # Tokenize the dataset
+                if isinstance(
+                    dataset, Dataset
+                ):  # `IterableDataset.map` does not support `desc`
+                    map_kwargs["desc"] = f"Tokenizing {dataset_name} dataset"
+
+                def tokenize_fn(example, processing_class):
+                    if "prompt" in example:  # explicit prompt case
+                        example["chosen"] = example["prompt"] + example["chosen"]
+                        example["rejected"] = example["prompt"] + example["rejected"]
+
+                    if is_conversational(example):
+                        chosen_input_ids = processing_class.apply_chat_template(
+                            example["chosen"],
+                            tools=example.get("tools"),
+                            return_dict=True,
+                            **example.get("chat_template_kwargs", {}),
+                        )["input_ids"]
+                        rejected_input_ids = processing_class.apply_chat_template(
+                            example["rejected"],
+                            tools=example.get("tools"),
+                            return_dict=True,
+                            **example.get("chat_template_kwargs", {}),
+                        )["input_ids"]
+                        output = {
+                            "chosen_input_ids": chosen_input_ids,
+                            "rejected_input_ids": rejected_input_ids,
+                        }
+                    else:
+                        output = {
+                            "chosen_input_ids": processing_class(
+                                text=example["chosen"]
+                            )["input_ids"],
+                            "rejected_input_ids": processing_class(
+                                text=example["rejected"]
+                            )["input_ids"],
+                        }
+                    return output
+
+                dataset = dataset.map(
+                    tokenize_fn,
+                    fn_kwargs={"processing_class": processing_class},
+                    **map_kwargs,
+                )
+
+            # Filter samples that are longer than `max_length`
+            if args.max_length is not None:
+                if isinstance(
+                    dataset, Dataset
+                ):  # `IterableDataset.map` does not support `desc`
+                    map_kwargs["desc"] = (
+                        f"Filtering {dataset_name} >{args.max_length} tokens"
+                    )
+                dataset = dataset.filter(
+                    lambda example: len(example["chosen_input_ids"]) <= args.max_length
+                    and len(example["rejected_input_ids"]) <= args.max_length,
+                    **map_kwargs,
+                )
+
+        return dataset
+
+    def _set_signature_columns_if_needed(self):
+        # If `self.args.remove_unused_columns` is True, non-signature columns are removed.
+        # By default, this method sets `self._signature_columns` to the model's expected inputs (usually, "input_ids"
+        # and "attention_mask").
+        if self._signature_columns is None:
+            self._signature_columns = [
+                "chosen_input_ids",
+                "rejected_input_ids",
+                "margin",
+            ]
+
+    def compute_loss(
+        self,
+        model: nn.Module,
+        inputs: dict[str, torch.Tensor | Any],
+        return_outputs: bool = False,
+        num_items_in_batch: torch.Tensor | None = None,
+    ):
+        """
+        Compute training loss and additionally compute token accuracies
+        """
+        mode = "train" if self.model.training else "eval"
+
+        # If not set, defaults from model config and may warn since cache isn't compatible with gradient checkpointing
+        inputs["use_cache"] = False
+        outputs = model(**inputs)
+
+        # Split the rewards into chosen and rejected
+        rewards_chosen, rewards_rejected = torch.chunk(
+            outputs.logits.squeeze(-1), chunks=2
+        )
+
+        # Calculate loss, optionally modulate with margin
+        if "margin" in inputs:
+            loss = -nn.functional.logsigmoid(
+                rewards_chosen - rewards_rejected - inputs["margin"]
+            ).mean()
+        else:
+            loss = -nn.functional.logsigmoid(rewards_chosen - rewards_rejected).mean()
+
+        if self.args.center_rewards_coefficient is not None:
+            loss += self.args.center_rewards_coefficient * torch.mean(
+                (rewards_chosen + rewards_rejected) ** 2
+            )
+
+        if mode == "train":
+            num_tokens_in_batch = (
+                self.accelerator.gather_for_metrics(inputs["attention_mask"].sum())
+                .sum()
+                .item()
+            )
+            self._total_train_tokens += num_tokens_in_batch
+        self._metrics[mode]["num_tokens"] = [self._total_train_tokens]
+
+        # Compute min, mean, max, accuracy and margin
+        with torch.no_grad():
+            all_rewards = self.accelerator.gather(outputs.logits)
+            self._metrics[mode]["min_reward"].append(all_rewards.min().item())
+            self._metrics[mode]["mean_reward"].append(all_rewards.mean().item())
+            self._metrics[mode]["max_reward"].append(all_rewards.max().item())
+
+            mean_accuracy = (rewards_chosen > rewards_rejected).float().mean()
+            mean_accuracy = (
+                self.accelerator.gather_for_metrics(mean_accuracy).mean().item()
+            )
+            self._metrics[mode]["accuracy"].append(mean_accuracy)
+
+            mean_margin = (rewards_chosen - rewards_rejected).mean()
+            mean_margin = self.accelerator.gather_for_metrics(mean_margin).mean()
+            self._metrics[mode]["margin"].append(mean_margin.item())
+
+        return (loss, outputs) if return_outputs else loss
+
+    # Override training step to add activation offloading context.
+    def training_step(self, *args, **kwargs):
+        with self.maybe_activation_offload_context:
+            return super().training_step(*args, **kwargs)
+
+    def log(self, logs: dict[str, float], start_time: float | None = None) -> None:
+        mode = "train" if self.model.training else "eval"
+        metrics = {
+            key: sum(val) / len(val) for key, val in self._metrics[mode].items()
+        }  # average the metrics
+
+        # This method can be called both in training and evaluation. When called in evaluation, the keys in `logs`
+        # start with "eval_". We need to add the prefix "eval_" to the keys in `metrics` to match the format.
+        if mode == "eval":
+            metrics = {f"eval_{key}": val for key, val in metrics.items()}
+
+        logs.update(metrics)
+        super().log(logs, start_time)
+        self._metrics[mode].clear()
+
+    # Ensure the model card is saved along with the checkpoint
+    def _save_checkpoint(self, model, trial):
+        if self.args.hub_model_id is None:
+            model_name = Path(self.args.output_dir).name
+        else:
+            model_name = self.args.hub_model_id.split("/")[-1]
+        self.create_model_card(model_name=model_name)
+        super()._save_checkpoint(model, trial)
diff --git a/src/aixpert/training/training/trl/trainer/rloo_config.py b/src/aixpert/training/training/trl/trainer/rloo_config.py
new file mode 100644
index 0000000..e58111f
--- /dev/null
+++ b/src/aixpert/training/training/trl/trainer/rloo_config.py
@@ -0,0 +1,615 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass, field
+
+from transformers import TrainingArguments
+
+
+@dataclass
+class RLOOConfig(TrainingArguments):
+    r"""
+    Configuration class for the [`RLOOTrainer`].
+
+    This class includes only the parameters that are specific to RLOO training. For a full list of training arguments,
+    please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this class may
+    differ from those in [`~transformers.TrainingArguments`].
+
+    Using [`~transformers.HfArgumentParser`] we can turn this class into
+    [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
+    command line.
+
+    Parameters
+    ----------
+        > Parameters that control the model and reference model
+
+        model_init_kwargs (`str`, `dict[str, Any]`, *optional*):
+            Keyword arguments for [`~transformers.AutoModelForCausalLM.from_pretrained`], used when the `model`
+            argument of the [`RLOOTrainer`] is provided as a string.
+        disable_dropout (`bool`, *optional*, defaults to `False`):
+            Whether to disable dropout in the model. This is useful for training with a reference model, as it prevents
+            the model from generating different logprobs for the same input.
+
+        > Parameters that control the data preprocessing
+
+        remove_unused_columns (`bool`, *optional*, defaults to `False`):
+            Whether to only keep the column `"prompt"` in the dataset. If you use a custom reward function that
+            requires any column other than `"prompts"` and `"completions"`, you should keep this to `False`.
+        max_prompt_length (`int` or `None`, *optional*, defaults to `512`):
+            Maximum length of the prompt. If the prompt is longer than this value, it will be truncated left.
+        num_generations (`int` or `None`, *optional*, defaults to `2`):
+            Number of generations per prompt to sample. The effective batch size (num_processes * per_device_batch_size
+            * gradient_accumulation_steps) must be evenly divisible by this value.
+        max_completion_length (`int` or `None`, *optional*, defaults to `256`):
+            Maximum length of the generated completion.
+        ds3_gather_for_generation (`bool`, *optional*, defaults to `True`):
+            This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation,
+            improving generation speed. However, disabling this option allows training models that exceed the VRAM
+            capacity of a single GPU, albeit at the cost of slower generation. Disabling this option is not compatible
+            with vLLM generation.
+        shuffle_dataset (`bool`, *optional*, defaults to `True`):
+            Whether to shuffle the training dataset.
+
+        > Parameters that control generation
+
+        generation_batch_size: (`int`, *optional*):
+            Batch size to use for generation. If `None`, it defaults to the effective training batch size:
+            `per_device_train_batch_size * num_processes * steps_per_generation`. In other words, there is one
+            generation batch processed per optimization step. Mutually exclusive with `steps_per_generation`.
+        steps_per_generation: (`int`, *optional*):
+            Number of steps per generation. If `None`, it defaults to `gradient_accumulation_steps`. Mutually exclusive
+            with `generation_batch_size`.
+        temperature (`float`, defaults to `1.0`):
+            Temperature for sampling. The higher the temperature, the more random the completions.
+        top_p (`float`, *optional*, defaults to `1.0`):
+            Float that controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. Set to
+            `1.0` to consider all tokens.
+        top_k (`int`, *optional*):
+            Number of highest probability vocabulary tokens to keep for top-k-filtering. If `None`, top-k-filtering is
+            disabled and all tokens are considered.
+        min_p (`float`, *optional*):
+            Minimum token probability, which will be scaled by the probability of the most likely token. It must be a
+            value between `0.0` and `1.0`. Typical values are in the `0.01-0.2` range.
+        generation_kwargs (`dict[str, Any]`, *optional*):
+            Additional keyword arguments to pass to [`~transformers.GenerationConfig`] (if using transformers) or
+            `SamplingParams` (if using vLLM) when sampling completions. This can be used to further customize the
+            generation behavior, such as setting `suppress_tokens`, `num_beams`, etc. If it contains keys that conflict
+            with the other generation parameters (like `min_p`, `top_p`, etc.), they will override them.
+        chat_template_kwargs (`dict[str, Any]`, *optional*):
+            Additional keyword arguments to pass to the `apply_chat_template` function when generating completions.
+        repetition_penalty (`float`, *optional*, defaults to `1.0`):
+            Float that penalizes new tokens based on whether they appear in the prompt and the generated text so far.
+            Values > `1.0` encourage the model to use new tokens, while values < `1.0` encourage the model to repeat
+            tokens.
+        use_transformers_paged (`bool`, *optional*, defaults to `False`):
+            Whether to use the `transformers` paged implementation for generation. If set to `True`, the `transformers`
+            paged implementation will be used for generation instead of the default padded implementation. This
+            parameter is only effective when `use_vllm` is set to `False`.
+        cache_implementation (`str`, *optional*):
+            Implementation of the cache method for faster generation when `use_vllm` is set to `False`.
+
+        > Parameters that control generation acceleration powered by vLLM
+
+        use_vllm (`bool`, *optional*, defaults to `False`):
+            Whether to use vLLM for generating completions. If set to `True`, the trainer will use vLLM for generation
+            instead of the default model.generate(). Requires `vllm` to be installed.
+        vllm_mode (`str`, *optional*, defaults to `"server"`):
+            Mode to use for vLLM integration when `use_vllm` is set to `True`. Must be one of `"server"` or
+            `"colocate"`.
+
+            - `"server"`: The trainer will send generation requests to a separate vLLM server. Make sure a TRL vLLM
+              server is running (start with `trl vllm-serve`).
+            - `"colocate"`: vLLM will run in the same process and share the training GPUs. This avoids the need for a
+              separate server but may cause resource contention with training.
+        vllm_model_impl (`str`, *optional*, defaults to `"vllm"`):
+            Model implementation to use for vLLM. Must be one of `"transformers"` or `"vllm"`. `"transformers"`: Use
+            the `transformers` backend for model implementation. `"vllm"`: Use the `vllm` library for model
+            implementation.
+        vllm_guided_decoding_regex (`str`, *optional*):
+            Regex for vLLM guided decoding. If `None` (default), guided decoding is disabled.
+
+        > Parameters that control the vLLM server (only used when `vllm_mode` is `"server"`)
+
+        vllm_server_base_url (`str`, *optional*):
+            Base URL for the vLLM server (e.g., `"http://localhost:8000"`). If provided, `vllm_server_host` and
+            `vllm_server_port` are ignored.
+        vllm_server_host (`str`, *optional*, defaults to `"0.0.0.0"`):
+            Host of the vLLM server to connect to. Ignored if `vllm_server_base_url` is provided.
+        vllm_server_port (`int`, *optional*, defaults to `8000`):
+            Port of the vLLM server to connect to. Ignored if `vllm_server_base_url` is provided.
+        vllm_server_timeout (`float`, *optional*, defaults to `240.0`):
+            Total timeout duration in seconds to wait for the vLLM server to be up. If the server is not up after the
+            timeout, a `ConnectionError` is raised.
+
+        > Parameters that control colocated vLLM execution (only used when `vllm_mode` is `"colocate"`)
+
+        vllm_gpu_memory_utilization (`float`, *optional*, defaults to `0.3`):
+            Control the GPU memory utilization for vLLM. This setting only applies when `vllm_mode` is set to
+            `"colocate"`. If you are using `vllm_mode="server"`, this parameter must be passed separately when
+            launching the vLLM server via the `--vllm_gpu_memory_utilization` flag.
+        vllm_tensor_parallel_size (`int`, *optional*, defaults to `1`):
+            Control the tensor parallel size for vLLM. This setting only applies when `vllm_mode` is set to
+            `"colocate"`. If you are using `vllm_mode="server"`, this parameter must be passed separately when
+            launching the vLLM server via the `--vllm_tensor_parallel_size` flag.
+        vllm_enable_sleep_mode (`bool`, *optional*, defaults to `False`):
+            Enable vLLM sleep mode to offload weights/cache during the optimizer step. Keeps GPU memory usage low, but
+            waking the engine adds host–device transfer latency.
+
+        > Parameters that control the training
+
+        beta (`float`, *optional*, defaults to `0.05`):
+            KL coefficient. If `0.0`, the reference model is not loaded, reducing memory usage and improving training
+            speed.
+        num_iterations (`int`, *optional*, defaults to `1`):
+            Number of iterations per batch (denoted as μ in the algorithm).
+        epsilon (`float`, *optional*, defaults to `0.2`):
+            Epsilon value for clipping.
+        epsilon_high (`float`, *optional*):
+            Upper-bound epsilon value for clipping. If not specified, it defaults to the same value as the lower-bound
+            specified in argument `epsilon`. Paper [DAPO](https://huggingface.co/papers/2503.14476) recommends `0.28`.
+        reward_weights (`list[float]`, *optional*):
+            Weights for each reward function. Must match the number of reward functions. If `None`, all rewards are
+            weighted equally with weight `1.0`.
+        normalize_advantages (`bool`, *optional*, defaults to `False`):
+            Whether to normalize advantages. Normalization is done per generation batch to have mean `0.0` and standard
+            deviation of `1.0`.
+        reward_clip_range (`tuple[float, float]`, *optional*):
+            Clip range for rewards as (min, max). If `None`, no clipping is applied.
+        mask_truncated_completions (`bool`, *optional*, defaults to `False`):
+            When enabled, truncated completions are excluded from the loss calculation, preventing them from being
+            incorrectly penalized and introducing noise during training. According to the
+            [DAPO](https://huggingface.co/papers/2503.14476) paper, this is a good practice for training stability.
+        sync_ref_model (`bool`, *optional*, defaults to `False`):
+            Whether to synchronize the reference model with the active model every `ref_model_sync_steps` steps, using
+            the `ref_model_mixup_alpha` parameter. This synchronization originates from the
+            [TR-DPO](https://huggingface.co/papers/2404.09656) paper.
+        ref_model_mixup_alpha (`float`, *optional*, defaults to `0.6`):
+            α parameter from the [TR-DPO](https://huggingface.co/papers/2404.09656) paper, which controls the mix
+            between the current policy and the previous reference policy during updates. The reference policy is
+            updated according to the equation: `π_ref = α * π_θ + (1 - α) * π_ref_prev`. To use this parameter, you
+            must set `sync_ref_model=True`.
+        ref_model_sync_steps (`int`, *optional*, defaults to `512`):
+            τ parameter from the [TR-DPO](https://huggingface.co/papers/2404.09656) paper, which determines how
+            frequently the current policy is synchronized with the reference policy. To use this parameter, you must
+            set `sync_ref_model=True`.
+
+        > Parameters that control the logging
+
+        log_completions (`bool`, *optional*, defaults to `False`):
+            Whether to log a sample of (prompt, completion) pairs every `logging_steps` steps. If `rich` is installed,
+            it prints the sample. If `wandb` and/or `trackio` logging is enabled, it logs it to `wandb` and/or
+            `trackio`.
+        num_completions_to_print (`int`, *optional*):
+            Number of completions to print with `rich`. If `None`, all completions are logged.
+        wandb_log_unique_prompts (`bool`, *optional*, defaults to `False`):
+            Whether to log unique prompts in wandb. If `True`, only unique prompts are logged. If `False`, all prompts
+            are logged.
+    """
+
+    _VALID_DICT_FIELDS = TrainingArguments._VALID_DICT_FIELDS + ["model_init_kwargs"]
+
+    # Parameters whose default values are overridden from TrainingArguments
+    learning_rate: float = field(
+        default=1e-6,
+        metadata={"help": "The initial learning rate for AdamW."},
+    )
+    logging_steps: float = field(
+        default=10,
+        metadata={
+            "help": "Log every X updates steps. Should be an integer or a float in range `[0,1)`. If smaller than 1, "
+            "will be interpreted as ratio of total training steps."
+        },
+    )
+    gradient_checkpointing: bool = field(
+        default=True,
+        metadata={
+            "help": "If True, use gradient checkpointing to save memory at the expense of slower backward pass."
+        },
+    )
+    bf16: bool | None = field(
+        default=None,
+        metadata={
+            "help": "Whether to use bf16 (mixed) precision instead of 32-bit. Requires Ampere or higher NVIDIA "
+            "architecture or Intel XPU or using CPU (use_cpu) or Ascend NPU. If not set, it defaults to `True` if "
+            "`fp16` is not set."
+        },
+    )
+
+    # Parameters that control the model and reference model
+    model_init_kwargs: dict | str | None = field(
+        default=None,
+        metadata={
+            "help": "Keyword arguments for `transformers.AutoModelForCausalLM.from_pretrained`, used when the `model` "
+            "argument of the `RLOOTrainer` is provided as a string."
+        },
+    )
+    disable_dropout: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to disable dropout in the model. This is useful for training with a reference model, as "
+            "it prevents the model from generating different logprobs for the same input."
+        },
+    )
+
+    # Parameters that control the data preprocessing
+    # The default value remove_unused_columns is overwritten from the parent class, because in RLOO we usually rely on
+    # additional columns to compute the reward
+    remove_unused_columns: bool | None = field(
+        default=False,
+        metadata={
+            "help": "Whether to only keep the column 'prompt' in the dataset. If you use a custom reward function "
+            "that requires any column other than 'prompts' and 'completions', you should keep this to `False`."
+        },
+    )
+    max_prompt_length: int | None = field(
+        default=512,
+        metadata={
+            "help": "Maximum length of the prompt. If the prompt is longer than this value, it will be truncated left."
+        },
+    )
+    num_generations: int | None = field(
+        default=2,
+        metadata={
+            "help": "Number of generations to sample. The effective batch size (num_processes * per_device_batch_size "
+            "* gradient_accumulation_steps) must be evenly divisible by this value."
+        },
+    )
+    max_completion_length: int | None = field(
+        default=256,
+        metadata={"help": "Maximum length of the generated completion."},
+    )
+    ds3_gather_for_generation: bool = field(
+        default=True,
+        metadata={
+            "help": "This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for "
+            "generation, improving generation speed. However, disabling this option allows training models that "
+            "exceed the VRAM capacity of a single GPU, albeit at the cost of slower generation. Disabling this option "
+            "is not compatible with vLLM generation."
+        },
+    )
+    shuffle_dataset: bool | None = field(
+        default=True,
+        metadata={"help": "Whether to shuffle the training dataset."},
+    )
+
+    # Parameters that control generation
+    generation_batch_size: int | None = field(
+        default=None,
+        metadata={
+            "help": "Batch size to use for generation. If `None`, it defaults to the effective training batch size: "
+            "`per_device_train_batch_size * num_processes * steps_per_generation`."
+        },
+    )
+    steps_per_generation: int | None = field(
+        default=None,
+        metadata={
+            "help": "Number of steps per generation. If `None`, it defaults to `gradient_accumulation_steps`."
+        },
+    )
+    temperature: float = field(
+        default=1.0,
+        metadata={
+            "help": "Temperature for sampling. The higher the temperature, the more random the completions."
+        },
+    )
+    top_p: float = field(
+        default=1.0,
+        metadata={
+            "help": "Float that controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. "
+            "Set to 1.0 to consider all tokens."
+        },
+    )
+    top_k: int | None = field(
+        default=None,
+        metadata={
+            "help": "Number of highest probability vocabulary tokens to keep for top-k-filtering. If `None`, "
+            "top-k-filtering is disabled and all tokens are considered."
+        },
+    )
+    min_p: float | None = field(
+        default=None,
+        metadata={
+            "help": "Minimum token probability, which will be scaled by the probability of the most likely token. It "
+            "must be a value between 0.0 and 1.0. Typical values are in the 0.01-0.2 range."
+        },
+    )
+    generation_kwargs: dict | None = field(
+        default=None,
+        metadata={
+            "help": "Additional keyword arguments to pass to `GenerationConfig` (if using transformers) or "
+            "`SamplingParams` (if using vLLM) when sampling completions. This can be used to further customize the "
+            "generation behavior, such as setting `suppress_tokens`, `num_beams`, etc. If it contains keys that "
+            "conflict with the other generation parameters (like `min_p`, `top_p`, etc.), they will override them."
+        },
+    )
+    chat_template_kwargs: dict | None = field(
+        default=None,
+        metadata={
+            "help": "Additional keyword arguments to pass to the `apply_chat_template` function when generating "
+            "completions."
+        },
+    )
+    repetition_penalty: float = field(
+        default=1.0,
+        metadata={
+            "help": "Float that penalizes new tokens based on whether they appear in the prompt and the generated "
+            "text so far. Values > 1.0 encourage the model to use new tokens, while values < 1.0 encourage the model "
+            "to repeat tokens."
+        },
+    )
+    use_transformers_paged: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to use the `transformers` paged implementation for generation. If set to `True`, the "
+            "`transformers` paged implementation will be used for generation instead of the default padded "
+            "implementation. This parameter is only effective when `use_vllm` is set to `False`."
+        },
+    )
+    cache_implementation: str | None = field(
+        default=None,
+        metadata={
+            "help": "Implementation of the cache method for faster generation when use_vllm is set to False."
+        },
+    )
+
+    # Parameters that control generation acceleration powered by vLLM
+    use_vllm: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to use vLLM for generating completions. If set to `True`, the trainer will use vLLM for "
+            "generation instead of the default model.generate(). Requires `vllm` to be installed."
+        },
+    )
+    vllm_mode: str = field(
+        default="server",
+        metadata={
+            "help": "Mode to use for vLLM integration when `use_vllm` is set to `True`. Must be one of `'server'` or "
+            "`'colocate'`. `'server'`: The trainer will send generation requests to a separate vLLM server. Make sure "
+            "a TRL vLLM server is running (start with `trl vllm-serve`). `'colocate'`: vLLM will run in the same "
+            "process and share the training GPUs. This avoids the need for a separate server but may cause resource "
+            "contention with training."
+        },
+    )
+    vllm_model_impl: str = field(
+        default="vllm",
+        metadata={
+            "help": "Model implementation to use for vLLM. Must be one of `transformers` or `vllm`. `transformers`: "
+            "Use the `transformers` backend for model implementation. `vllm`: Use the `vllm` library for "
+            "model implementation."
+        },
+    )
+    vllm_enable_sleep_mode: bool = field(
+        default=False,
+        metadata={
+            "help": "Enable vLLM sleep mode to offload weights/cache during the optimizer step. Keeps GPU memory "
+            "usage low, but waking the engine adds host–device transfer latency."
+        },
+    )
+    vllm_guided_decoding_regex: str | None = field(
+        default=None,
+        metadata={
+            "help": "Regex for vLLM guided decoding. If `None` (default), guided decoding is disabled."
+        },
+    )
+
+    # Parameters that control the vLLM server (only used when `vllm_mode` is `"server"`)
+    vllm_server_base_url: str | None = field(
+        default=None,
+        metadata={
+            "help": "Base URL for the vLLM server (e.g., 'http://localhost:8000'). If provided, `vllm_server_host` "
+            "and `vllm_server_port` are ignored."
+        },
+    )
+    vllm_server_host: str = field(
+        default="0.0.0.0",
+        metadata={
+            "help": "Host of the vLLM server to connect to. Ignored if vllm_server_base_url is provided."
+        },
+    )
+    vllm_server_port: int = field(
+        default=8000,
+        metadata={
+            "help": "Port of the vLLM server to connect to. Ignored if vllm_server_base_url is provided."
+        },
+    )
+    vllm_server_timeout: float = field(
+        default=240.0,
+        metadata={
+            "help": "Total timeout duration in seconds to wait for the vLLM server to be up. If the server is not up "
+            "after the timeout, a `ConnectionError` is raised."
+        },
+    )
+
+    # Parameters that control colocated vLLM execution (only used when `vllm_mode` is `"colocate"`)
+    vllm_gpu_memory_utilization: float = field(
+        default=0.3,
+        metadata={
+            "help": "Control the GPU memory utilization for vLLM. This setting only applies when `vllm_mode` is set "
+            "to `'colocate'`. If you are using `vllm_mode='server'`, this parameter must be passed separately when "
+            "launching the vLLM server via the `--vllm_gpu_memory_utilization` flag."
+        },
+    )
+    vllm_tensor_parallel_size: int = field(
+        default=1,
+        metadata={
+            "help": "Control the tensor parallel size for vLLM. This setting only applies when `vllm_mode` is set "
+            "to `'colocate'`. If you are using `vllm_mode='server'`, this parameter must be passed separately when "
+            "launching the vLLM server via the `--vllm_tensor_parallel_size` flag."
+        },
+    )
+
+    # Parameters that control the training
+    beta: float = field(
+        default=0.05,
+        metadata={
+            "help": "KL coefficient. If `0.0`, the reference model is not loaded, reducing memory usage and improving "
+            "training speed."
+        },
+    )
+    num_iterations: int = field(
+        default=1,
+        metadata={
+            "help": "Number of iterations per batch (denoted as μ in the algorithm)."
+        },
+    )
+    epsilon: float = field(
+        default=0.2,
+        metadata={"help": "Epsilon value for clipping."},
+    )
+    epsilon_high: float | None = field(
+        default=None,
+        metadata={
+            "help": "Upper-bound epsilon value for clipping. If not specified, it defaults to the same value as the "
+            "lower-bound specified in argument `epsilon`. Paper DAPO recommends `0.28`."
+        },
+    )
+    reward_weights: list[float] | None = field(
+        default=None,
+        metadata={
+            "help": "Weights for each reward function. Must match the number of reward functions. If `None`, all "
+            "rewards are weighted equally with weight `1.0`."
+        },
+    )
+    normalize_advantages: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to normalize advantages. Normalization is done per generation batch to have mean `0.0` "
+            "and standard deviation of `1.0`."
+        },
+    )
+    reward_clip_range: tuple[float, float] | None = field(
+        default=None,
+        metadata={
+            "help": "Clip range for rewards as (min, max). If None, no clipping is applied."
+        },
+    )
+    mask_truncated_completions: bool = field(
+        default=False,
+        metadata={
+            "help": "When enabled, truncated completions are excluded from the loss calculation, preventing them from "
+            "being incorrectly penalized and introducing noise during training. According to the DAPO paper, this is "
+            "a good practice for training stability."
+        },
+    )
+    sync_ref_model: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to synchronize the reference model with the active model every `ref_model_sync_steps` "
+            "steps, using the `ref_model_mixup_alpha` parameter."
+        },
+    )
+    ref_model_mixup_alpha: float = field(
+        default=0.6,
+        metadata={
+            "help": "α parameter from the TR-DPO paper, which controls the mix between the current policy and the "
+            "previous reference policy during updates. The reference policy is updated according to the equation: "
+            "`π_ref = α * π_θ + (1 - α) * π_ref_prev`. To use this parameter, you must set `sync_ref_model=True`."
+        },
+    )
+    ref_model_sync_steps: int = field(
+        default=512,
+        metadata={
+            "help": "τ parameter from the TR-DPO paper, which determines how frequently the current policy is "
+            "synchronized with the reference policy. To use this parameter, you must set `sync_ref_model=True`."
+        },
+    )
+
+    # Parameters that control the logging
+    log_completions: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to log a sample of (prompt, completion) pairs every `logging_steps` steps. If `rich` is "
+            "installed, it prints the sample. If `wandb` logging is enabled, it logs it to `wandb`."
+        },
+    )
+    num_completions_to_print: int | None = field(
+        default=None,
+        metadata={
+            "help": "Number of completions to print with `rich`. If `None`, all completions are logged."
+        },
+    )
+    wandb_log_unique_prompts: bool | None = field(
+        default=False,
+        metadata={
+            "help": "Whether to log unique prompts in wandb. If `True`, only unique prompts are logged. If `False`, "
+            "all prompts are logged."
+        },
+    )
+
+    def __post_init__(self):
+        self.bf16 = not (self.fp16) if self.bf16 is None else self.bf16
+
+        super().__post_init__()
+
+        num_processes = self.world_size
+        # The current default effective batch size
+        if self.generation_batch_size is None and self.steps_per_generation is None:
+            self.steps_per_generation = self.gradient_accumulation_steps
+            self.generation_batch_size = (
+                self.per_device_train_batch_size
+                * num_processes
+                * self.steps_per_generation
+            )
+        elif (
+            self.generation_batch_size is not None and self.steps_per_generation is None
+        ):
+            # Just ensure the value is divisible by the global batch size
+            if (
+                self.generation_batch_size
+                % (self.per_device_train_batch_size * num_processes)
+                != 0
+            ):
+                raise ValueError(
+                    f"generation_batch_size ({self.generation_batch_size}) must be divisible by the global batch size "
+                    f"({self.per_device_train_batch_size * num_processes})."
+                )
+            self.steps_per_generation = self.generation_batch_size // (
+                self.per_device_train_batch_size * num_processes
+            )
+        elif (
+            self.generation_batch_size is None and self.steps_per_generation is not None
+        ):
+            self.generation_batch_size = (
+                self.per_device_train_batch_size
+                * num_processes
+                * self.steps_per_generation
+            )
+        else:
+            raise ValueError(
+                "'generation_batch_size' and 'steps_per_generation' can not be both configured at the same time"
+            )
+
+        if self.do_eval and self.eval_strategy != "no":
+            # Just ensure the value is divisible by the global batch size
+            if (
+                self.per_device_eval_batch_size * num_processes
+            ) % self.num_generations != 0:
+                raise ValueError(
+                    f"The global eval batch size ({self.per_device_eval_batch_size} * {num_processes}) must be "
+                    f"divisible by num_generations ({self.num_generations})."
+                )
+
+        # The generation batch must contain full prompt groups (no partials), so it must be divisible by
+        # num_generations.
+        if self.generation_batch_size % self.num_generations != 0:
+            raise ValueError(
+                f"generation_batch_size ({self.generation_batch_size}) must be divisible by num_generations "
+                f"({self.num_generations})."
+            )
+
+        if self.num_generations < 2:
+            raise ValueError(
+                "RLOO requires at least 2 generations per prompt to calculate the advantages. You provided "
+                f"{self.num_generations}, which is less than the minimum required."
+            )
diff --git a/src/aixpert/training/training/trl/trainer/rloo_trainer.py b/src/aixpert/training/training/trl/trainer/rloo_trainer.py
new file mode 100644
index 0000000..9af96cc
--- /dev/null
+++ b/src/aixpert/training/training/trl/trainer/rloo_trainer.py
@@ -0,0 +1,1930 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import os
+import textwrap
+import warnings
+from collections import defaultdict, deque
+from collections.abc import Callable
+from contextlib import nullcontext
+from functools import partial
+from pathlib import Path
+from typing import Any
+
+import datasets
+import pandas as pd
+import torch
+import torch.utils.data
+import transformers
+from accelerate import logging
+from accelerate.utils import (
+    broadcast_object_list,
+    gather,
+    gather_object,
+    is_peft_model,
+    set_seed,
+)
+from datasets import Dataset, IterableDataset
+from torch import nn
+from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+from torch.utils.data import DataLoader, Sampler
+from transformers import (
+    AutoConfig,
+    AutoModelForSequenceClassification,
+    AutoProcessor,
+    AutoTokenizer,
+    GenerationConfig,
+    PreTrainedModel,
+    PreTrainedTokenizerBase,
+    ProcessorMixin,
+    TrainerCallback,
+    is_trackio_available,
+    is_wandb_available,
+)
+from transformers.trainer_utils import seed_worker
+from transformers.utils import (
+    is_datasets_available,
+    is_peft_available,
+    is_rich_available,
+)
+
+from ..data_utils import (
+    apply_chat_template,
+    is_conversational,
+    prepare_multimodal_messages,
+    prepare_multimodal_messages_vllm,
+)
+from ..extras.profiling import profiling_context, profiling_decorator
+from ..extras.vllm_client import VLLMClient
+from ..import_utils import is_vllm_available
+from ..models import (
+    prepare_deepspeed,
+    prepare_fsdp,
+    prepare_peft_model,
+    unwrap_model_for_generation,
+)
+from .base_trainer import BaseTrainer
+from .callbacks import SyncRefModelCallback
+from .rloo_config import RLOOConfig
+from .utils import (
+    RepeatSampler,
+    disable_dropout_in_model,
+    ensure_master_addr_port,
+    entropy_from_logits,
+    get_config_model_id,
+    identity,
+    nanmax,
+    nanmin,
+    nanstd,
+    pad,
+    print_prompt_completions_sample,
+    selective_log_softmax,
+    shuffle_sequence_dict,
+    split_pixel_values_by_grid,
+    split_tensor_dict,
+    unsplit_pixel_values_by_grid,
+)
+
+
+if is_peft_available():
+    from peft import PeftConfig, PeftModel
+
+if is_vllm_available():
+    from vllm import LLM, SamplingParams
+    from vllm.sampling_params import GuidedDecodingParams
+
+if is_wandb_available():
+    import wandb
+
+if is_trackio_available():
+    import trackio
+
+
+logger = logging.get_logger(__name__)
+
+# What we call a reward function is a callable that takes a list of prompts and completions and returns a list of
+# rewards. When it's a string, it's a model ID, so it's loaded as a pretrained model.
+RewardFunc = str | PreTrainedModel | Callable[[list, list], list[float]]
+
+
+class RLOOTrainer(BaseTrainer):
+    """
+    Trainer for the Reinforce Leave One Out (RLOO) method. This algorithm was initially proposed in the paper [Back to
+    Basics: Revisiting REINFORCE Style Optimization for Learning from Human Feedback in
+    LLMs](https://huggingface.co/papers/2402.14740).
+
+    Example:
+
+    ```python
+    from datasets import load_dataset
+    from trl import RLOOTrainer
+
+    dataset = load_dataset("trl-lib/tldr", split="train")
+
+
+    def reward_func(completions, **kwargs):
+        # Dummy reward function that rewards completions with more unique letters.
+        return [float(len(set(completion))) for completion in completions]
+
+
+    trainer = RLOOTrainer(
+        model="Qwen/Qwen2-0.5B-Instruct",
+        reward_funcs=reward_func,
+        train_dataset=dataset,
+    )
+
+    trainer.train()
+    ```
+
+    Args:
+        model (`str | PreTrainedModel`):
+            Model to be trained. Can be either:
+
+            - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or a
+              path to a *directory* containing model weights saved using
+              [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded
+              using [`~transformers.AutoModelForCausalLM.from_pretrained`] with the keyword arguments in
+              `args.model_init_kwargs`.
+            - A [`~transformers.PreTrainedModel`] object. Only causal language models are supported.
+        reward_funcs (`RewardFunc | list[RewardFunc]`):
+            Reward functions to be used for computing the rewards. To compute the rewards, we call all the reward
+            functions with the prompts and completions and sum the rewards. Can be either:
+
+            - A single reward function, such as:
+                - A string: The *model ID* of a pretrained model hosted inside a model repo on huggingface.co, or a
+                path to a *directory* containing model weights saved using
+                [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded
+                using [`~transformers.AutoModelForSequenceClassification.from_pretrained`] with `num_labels=1` and the
+                keyword arguments in `args.model_init_kwargs`.
+                - A [`~transformers.PreTrainedModel`] object: Only sequence classification models are supported.
+                - A custom reward function: The function is provided with the prompts and the generated completions,
+                  plus any additional columns in the dataset. It should return a list of rewards. Custom reward
+                  functions can also return `None` when the reward is not applicable to those samples. This is useful
+                  for multi-task training where different reward functions apply to different types of samples. When a
+                  reward function returns `None` for a sample, that reward function is excluded from the reward
+                  calculation for that sample. For more details, see [Using a custom reward
+                  function](#using-a-custom-reward-function).
+
+                  The trainer's state is also passed to the reward function. The trainer's state is an instance of
+                  [`~transformers.TrainerState`] and can be accessed by accessing the `trainer_state` argument to the
+                  reward function's signature.
+            - A list of reward functions, where each item can independently be any of the above types. Mixing different
+            types within the list (e.g., a string model ID and a custom reward function) is allowed.
+        args ([`RLOOConfig`], *optional*):
+            Configuration for this trainer. If `None`, a default configuration is used.
+        train_dataset ([`~datasets.Dataset`] or [`~datasets.IterableDataset`]):
+            Dataset to use for training. It must include a column `"prompt"`. Any additional columns in the dataset is
+            ignored. The format of the samples can be either:
+
+            - [Standard](dataset_formats#standard): Each sample contains plain text.
+            - [Conversational](dataset_formats#conversational): Each sample contains structured messages (e.g., role
+              and content).
+        eval_dataset ([`~datasets.Dataset`], [`~datasets.IterableDataset`] or `dict[str, Dataset | IterableDataset]`):
+            Dataset to use for evaluation. It must meet the same requirements as `train_dataset`.
+        processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.ProcessorMixin`], *optional*):
+            Processing class used to process the data. The padding side must be set to "left". If `None`, the
+            processing class is loaded from the model's name with [`~transformers.AutoProcessor.from_pretrained`]. A
+            padding token, `tokenizer.pad_token`, must be set. If the processing class has not set a padding token,
+            `tokenizer.eos_token` will be used as the default.
+        reward_processing_classes ([`~transformers.PreTrainedTokenizerBase`] or `list[PreTrainedTokenizerBase]`, *optional*):
+            Processing classes corresponding to the reward functions specified in `reward_funcs`. Can be either:
+
+            - A single processing class: Used when `reward_funcs` contains only one reward function.
+            - A list of processing classes: Must match the order and length of the reward functions in `reward_funcs`.
+            If set to `None`, or if an element of the list corresponding to a [`~transformers.PreTrainedModel`] is
+            `None`, the tokenizer for the model is automatically loaded using
+            [`~transformers.AutoTokenizer.from_pretrained`]. For elements in `reward_funcs` that are custom reward
+            functions (not [`~transformers.PreTrainedModel`]), the corresponding entries in `reward_processing_classes`
+            are ignored.
+        callbacks (list of [`~transformers.TrainerCallback`], *optional*):
+            List of callbacks to customize the training loop. Will add those to the list of default callbacks detailed
+            in [here](https://huggingface.co/docs/transformers/main_classes/callback).
+
+            If you want to remove one of the default callbacks used, use the [`~transformers.Trainer.remove_callback`]
+            method.
+        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*, defaults to `(None, None)`):
+            A tuple containing the optimizer and the scheduler to use. Will default to an instance of [`AdamW`] on your
+            model and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`.
+        peft_config ([`~peft.PeftConfig`], *optional*):
+            PEFT configuration used to wrap the model. If `None`, the model is not wrapped.
+    """
+
+    _tag_names = ["trl", "rloo"]
+    _name = "RLOO"
+    _paper = {
+        "title": "Back to Basics: Revisiting REINFORCE-Style Optimization for Learning from Human Feedback in LLMs",
+        "id": "2402.14740",
+        # docstyle-ignore
+        "citation": textwrap.dedent("""\
+            @inproceedings{ahmadian2024back,
+                title        = {{Back to Basics: Revisiting REINFORCE-Style Optimization for Learning from Human Feedback in LLMs}},
+                author       = {Arash Ahmadian and Chris Cremer and Matthias Gall{\'{e}} and Marzieh Fadaee and Julia Kreutzer and Olivier Pietquin and Ahmet {\"{U}}st{\"{u}}n and Sara Hooker},
+                year         = 2024,
+                booktitle    = {Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), {ACL} 2024, Bangkok, Thailand, August 11-16, 2024},
+                pages        = {12248--12267},
+                publisher    = {Association for Computational Linguistics},
+                editor       = {Lun{-}Wei Ku and Andre Martins and Vivek Srikumar},
+            }"""),
+    }
+
+    def __init__(
+        self,
+        model: str | PreTrainedModel = None,
+        reward_funcs: RewardFunc | list[RewardFunc] = None,
+        args: RLOOConfig | None = None,
+        train_dataset: Dataset | IterableDataset | None = None,
+        eval_dataset: Dataset
+        | IterableDataset
+        | dict[str, Dataset | IterableDataset]
+        | None = None,
+        processing_class: PreTrainedTokenizerBase | ProcessorMixin | None = None,
+        reward_processing_classes: PreTrainedTokenizerBase
+        | list[PreTrainedTokenizerBase]
+        | None = None,
+        callbacks: list[TrainerCallback] | None = None,
+        optimizers: tuple[
+            torch.optim.Optimizer | None, torch.optim.lr_scheduler.LambdaLR | None
+        ] = (None, None),
+        peft_config: "PeftConfig | None" = None,
+    ):
+        if not os.environ.get("TRL_EXPERIMENTAL_SILENCE"):
+            warnings.warn(
+                "This trainer will soon be moved to trl.experimental and is a candidate for removal. If you rely on "
+                "it and want it to remain, please share your comments here: "
+                "https://github.com/huggingface/trl/issues/4223. Silence this warning by setting environment variable "
+                "TRL_EXPERIMENTAL_SILENCE=1."
+            )
+
+        # Args
+        if args is None:
+            model_name = (
+                model if isinstance(model, str) else get_config_model_id(model.config)
+            )
+            model_name = model_name.split("/")[-1]
+            args = RLOOConfig(f"{model_name}-RLOO")
+
+        # Models
+        # Trained model
+        model_init_kwargs = args.model_init_kwargs or {}
+        if isinstance(model, str):
+            model_id = model
+            dtype = model_init_kwargs.get("dtype")
+            if isinstance(dtype, torch.dtype) or dtype == "auto" or dtype is None:
+                pass  # dtype is already a torch.dtype or "auto" or None
+            elif isinstance(dtype, str):  # it's a str, but not "auto"
+                dtype = getattr(torch, dtype)
+                model_init_kwargs["dtype"] = dtype
+            else:
+                raise ValueError(
+                    "Invalid `dtype` passed to `RLOOConfig`. Expected either 'auto' or a string representing "
+                    f"a `torch.dtype` (e.g., 'float32'), but got {dtype}."
+                )
+            # Disable caching if gradient checkpointing is enabled (not supported)
+            config = AutoConfig.from_pretrained(model_id)
+            architecture = getattr(transformers, config.architectures[0])
+            model = architecture.from_pretrained(model_id, **model_init_kwargs)
+        else:
+            model_id = get_config_model_id(model.config)
+            if args.model_init_kwargs is not None:
+                logger.warning(
+                    "You passed `model_init_kwargs` to the `RLOOConfig`, but your model is already instantiated. "
+                    "The `model_init_kwargs` will be ignored."
+                )
+
+        # Some models (SmolVLM/Idefics3) don't support `logits_to_keep` argument and error out if we pass it
+        # Inspect the forward method before we wrap the model with PEFT
+        self.model_kwarg_keys = (
+            inspect.signature(model.forward).parameters.keys()
+            if not hasattr(model, "get_base_model")
+            else inspect.signature(model.get_base_model().forward).parameters.keys()
+        )
+
+        if peft_config is not None or (
+            is_peft_available() and isinstance(model, PeftModel)
+        ):
+            model = prepare_peft_model(model, peft_config, args)
+
+        # Processing class
+        if processing_class is None:
+            processing_class = AutoProcessor.from_pretrained(
+                get_config_model_id(model.config), truncation_side="left"
+            )
+
+        # Handle pad token for processors or tokenizers
+        if isinstance(processing_class, ProcessorMixin):
+            tokenizer = processing_class.tokenizer
+        elif isinstance(processing_class, PreTrainedTokenizerBase):
+            tokenizer = processing_class
+        else:
+            raise TypeError(
+                "The `processing_class` must be either a `PreTrainedTokenizerBase` or a `ProcessorMixin`"
+            )
+
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+
+        self.pad_token = tokenizer.pad_token
+        self.pad_token_id = tokenizer.pad_token_id
+        self.eos_token_id = tokenizer.eos_token_id
+
+        # Reward functions
+        if not isinstance(reward_funcs, list):
+            reward_funcs = [reward_funcs]
+        self.reward_func_names = []
+        for i, reward_func in enumerate(reward_funcs):
+            if isinstance(reward_func, str):
+                reward_funcs[i] = AutoModelForSequenceClassification.from_pretrained(
+                    reward_func, num_labels=1, **model_init_kwargs
+                )
+            if isinstance(
+                reward_funcs[i], nn.Module
+            ):  # Use Module over PretrainedModel for compat w/ compiled models
+                self.reward_func_names.append(
+                    get_config_model_id(reward_funcs[i].config).split("/")[-1]
+                )
+            else:
+                self.reward_func_names.append(reward_funcs[i].__name__)
+        self.reward_funcs = reward_funcs
+
+        # Reward weights
+        if args.reward_weights is not None:
+            if len(args.reward_weights) != len(reward_funcs):
+                raise ValueError(
+                    f"Number of reward weights ({len(args.reward_weights)}) must match number of reward "
+                    f"functions ({len(reward_funcs)})"
+                )
+            self.reward_weights = torch.tensor(args.reward_weights, dtype=torch.float32)
+        else:
+            self.reward_weights = torch.ones(len(reward_funcs), dtype=torch.float32)
+
+        # Reward processing class
+        if reward_processing_classes is None:
+            reward_processing_classes = [None] * len(reward_funcs)
+        elif not isinstance(reward_processing_classes, list):
+            reward_processing_classes = [reward_processing_classes]
+        if len(reward_processing_classes) != len(reward_funcs):
+            raise ValueError(
+                f"The number of reward processing classes ({len(reward_processing_classes)}) must match the number of "
+                f"reward functions ({len(reward_funcs)})."
+            )
+
+        for i, (reward_processing_class, reward_func) in enumerate(
+            zip(reward_processing_classes, reward_funcs, strict=True)
+        ):
+            if isinstance(reward_func, PreTrainedModel):
+                if reward_processing_class is None:
+                    reward_processing_class = AutoTokenizer.from_pretrained(
+                        get_config_model_id(reward_func.config)
+                    )
+                if reward_processing_class.pad_token_id is None:
+                    reward_processing_class.pad_token = (
+                        reward_processing_class.eos_token
+                    )
+                # The reward model computes the reward for the latest non-padded token in the input sequence.
+                # So it's important to set the pad token ID to the padding token ID of the processing class.
+                reward_func.config.pad_token_id = reward_processing_class.pad_token_id
+                reward_processing_classes[i] = reward_processing_class
+
+        self.reward_processing_classes = reward_processing_classes
+
+        # Training arguments
+        self.max_prompt_length = args.max_prompt_length
+        self.max_completion_length = args.max_completion_length
+        self.num_generations = args.num_generations
+        self.chat_template_kwargs = args.chat_template_kwargs or {}
+        self.temperature = args.temperature
+        self.top_p = args.top_p
+        self.top_k = args.top_k
+        self.min_p = args.min_p
+        self.repetition_penalty = args.repetition_penalty
+        self.use_transformers_paged = args.use_transformers_paged
+        self.use_vllm = args.use_vllm
+        self.vllm_mode = args.vllm_mode
+        self.vllm_gpu_memory_utilization = (
+            args.vllm_gpu_memory_utilization
+        )  # only applies to colocation mode
+        self.vllm_tensor_parallel_size = (
+            args.vllm_tensor_parallel_size
+        )  # only applies to colocation mode
+        self.normalize_advantages = args.normalize_advantages
+        self.mask_truncated_completions = args.mask_truncated_completions
+        self.reward_clip_range = args.reward_clip_range
+
+        # Datasets
+        self.shuffle_dataset = args.shuffle_dataset
+
+        if (
+            isinstance(train_dataset, IterableDataset)
+            or isinstance(eval_dataset, IterableDataset)
+            or (
+                isinstance(eval_dataset, dict)
+                and any(isinstance(ds, IterableDataset) for ds in eval_dataset.values())
+            )
+        ):
+            # See https://github.com/huggingface/trl/issues/3213
+            raise NotImplementedError(
+                "Iterable datasets are not yet supported in RLOOTrainer. Please use a standard dataset instead."
+            )
+
+        # Multi-step
+        self.num_iterations = args.num_iterations
+        self.epsilon_low = args.epsilon
+        self.epsilon_high = (
+            args.epsilon_high if args.epsilon_high is not None else args.epsilon
+        )
+        # Tracks the number of iterations (forward + backward passes), including those within a grad accum cycle
+        self._step = 0
+        # Buffer the batch to reuse generated outputs across multiple updates. For more details, see
+        # `_get_train_sampler` and `_prepare_inputs`.
+        self._buffered_inputs = None
+
+        # The trainer estimates the number of FLOPs (floating-point operations) using the number of elements in the
+        # input tensor associated with the key "input_ids". However, in RLOO, the sampled data does not include the
+        # "input_ids" key. Instead, the available keys is "prompt". As a result, the trainer issues the warning:
+        # "Could not estimate the number of tokens of the input, floating-point operations will not be computed." To
+        # suppress this warning, we set the "estimate_tokens" key in the model's "warnings_issued" dictionary to True.
+        # This acts as a flag to indicate that the warning has already been issued.
+        model.warnings_issued["estimate_tokens"] = True
+
+        super().__init__(
+            model=model,
+            args=args,
+            data_collator=identity,  # No data collation is needed in RLOO
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            processing_class=processing_class,
+            callbacks=callbacks,
+            optimizers=optimizers,
+        )
+
+        # Reference model
+        self.beta = args.beta
+        if self.beta == 0.0:
+            # If beta is 0.0, the reference model is not needed
+            self.ref_model = None
+        elif is_peft_model(model):
+            # If PEFT is used, the reference model is not needed since the adapter can be disabled
+            # to revert to the initial model.
+            self.ref_model = None
+        else:
+            # For deepspeed, fsdp or non-distributed models, create a reference model from scratch
+            config = AutoConfig.from_pretrained(model_id)
+            architecture = getattr(transformers, config.architectures[0])
+            self.ref_model = architecture.from_pretrained(model_id, **model_init_kwargs)
+
+        # Disable dropout in the models
+        if args.disable_dropout:
+            disable_dropout_in_model(model)
+            if self.ref_model is not None:
+                disable_dropout_in_model(self.ref_model)
+
+        # Initialize the metrics
+        self._metrics = {"train": defaultdict(list), "eval": defaultdict(list)}
+        self._total_train_tokens = 0
+        self.log_completions = args.log_completions
+        self.wandb_log_unique_prompts = args.wandb_log_unique_prompts
+        self.num_completions_to_print = args.num_completions_to_print
+        # Keep logs sized to the generation batch to record only outputs from the latest model update.
+        self._logs = {
+            "images": deque(maxlen=args.generation_batch_size),
+            "prompt": deque(maxlen=args.generation_batch_size),
+            "completion": deque(maxlen=args.generation_batch_size),
+            "rewards": defaultdict(lambda: deque(maxlen=args.generation_batch_size)),
+            "advantages": deque(maxlen=args.generation_batch_size),
+        }
+
+        # Ensure each process receives a unique seed to prevent duplicate completions when generating with
+        # transformers if num_generations exceeds per_device_train_batch_size. We could skip it if we use vLLM, but
+        # it's safer to set it in all cases.
+        set_seed(args.seed, device_specific=True)
+
+        if self.use_vllm:
+            if not is_vllm_available():
+                raise ImportError(
+                    "vLLM is not available and `use_vllm` is set to True. Please install vLLM with "
+                    "`pip install trl[vllm]` to use it."
+                )
+
+            if self.vllm_mode == "server":
+                if self.accelerator.is_main_process:
+                    if args.vllm_server_base_url is not None:
+                        base_url = args.vllm_server_base_url
+                    else:
+                        base_url = (
+                            f"http://{args.vllm_server_host}:{args.vllm_server_port}"
+                        )
+                    self.vllm_client = VLLMClient(
+                        base_url=base_url, connection_timeout=args.vllm_server_timeout
+                    )
+                    self.vllm_client.init_communicator(
+                        device=torch.cuda.current_device()
+                    )
+
+            elif self.vllm_mode == "colocate":
+                # Make sure vllm_tensor_parallel_size group size evenly divides the world size - each group should have
+                # the same number of ranks
+                if (
+                    not self.accelerator.num_processes % self.vllm_tensor_parallel_size
+                    == 0
+                ):
+                    raise ValueError(
+                        f"vllm_tensor_parallel_size ({self.vllm_tensor_parallel_size}) must divide world size "
+                        f"({self.accelerator.num_processes}) evenly."
+                    )
+
+                if self.vllm_tensor_parallel_size > 1:
+                    # Create subgroups of ranks for TP, each group with `vllm_tensor_parallel_size` ranks.
+                    # For example, if world_size=8 and vllm_tensor_parallel_size=2 → groups: [0,1], [2,3], [4,5], [6,7]
+                    self.tp_group, _ = torch.distributed.new_subgroups_by_enumeration(
+                        [
+                            list(
+                                range(
+                                    i * self.vllm_tensor_parallel_size,
+                                    (i + 1) * self.vllm_tensor_parallel_size,
+                                )
+                            )
+                            for i in range(
+                                self.accelerator.num_processes
+                                // self.vllm_tensor_parallel_size
+                            )
+                        ]
+                    )
+
+                # vLLM requires the environment variables to be set for distributed training.
+                os.environ["RANK"] = str(self.accelerator.process_index)
+                os.environ["LOCAL_RANK"] = str(self.accelerator.local_process_index)
+                os.environ["WORLD_SIZE"] = str(self.accelerator.num_processes)
+                # Ensure distributed rendezvous variables are set without colliding across concurrent runs
+                ensure_master_addr_port()
+
+                if (
+                    self.max_prompt_length is not None
+                    and self.max_completion_length is not None
+                ):
+                    max_model_len = self.max_prompt_length + self.max_completion_length
+                else:
+                    max_model_len = None
+                self.llm = LLM(
+                    model=model.name_or_path,
+                    tensor_parallel_size=args.vllm_tensor_parallel_size,
+                    gpu_memory_utilization=self.vllm_gpu_memory_utilization,
+                    max_num_seqs=self.args.per_device_train_batch_size
+                    * self.vllm_tensor_parallel_size
+                    * self.args.steps_per_generation,
+                    max_model_len=max_model_len,
+                    distributed_executor_backend="external_launcher",
+                    # Feed identical seed for tp groups to ensure sampling results are the same across workers
+                    seed=self.accelerator.process_index
+                    // self.vllm_tensor_parallel_size,
+                    # Latest vLLM v1 memory profiler is misled by the high default value (i.e., 32768) - thinking there's not enough memory
+                    max_num_batched_tokens=4096,
+                    model_impl=self.args.vllm_model_impl,
+                    enable_sleep_mode=self.args.vllm_enable_sleep_mode,
+                )
+                if self.args.vllm_enable_sleep_mode:
+                    self.llm.sleep(level=2)
+            else:
+                raise ValueError(
+                    f"vllm_mode must be either 'server' or 'colocate', got '{self.vllm_mode}'."
+                )
+
+            # vLLM specific sampling arguments
+            self.guided_decoding_regex = args.vllm_guided_decoding_regex
+
+            self._last_loaded_step = (
+                -1
+            )  # tag to avoid useless loading during grad accumulation
+
+            # When using vLLM, the main process is responsible for loading the model weights. This can cause process
+            # desynchronization and seems to lead to DeepSpeed hanging during initialization. To prevent this, we
+            # synchronize all processes after vLLM has been fully initialized.
+            self.accelerator.wait_for_everyone()
+        else:
+            generation_kwargs = {
+                "max_new_tokens": self.max_completion_length,
+                "do_sample": True,
+                "pad_token_id": tokenizer.pad_token_id,
+                "bos_token_id": tokenizer.bos_token_id,
+                "eos_token_id": tokenizer.eos_token_id,
+                "temperature": self.temperature,
+                "top_p": self.top_p,
+                "top_k": self.top_k,
+                "min_p": self.min_p,
+                "repetition_penalty": self.repetition_penalty,
+                "cache_implementation": args.cache_implementation,
+            }
+            if args.generation_kwargs is not None:
+                generation_kwargs.update(args.generation_kwargs)
+            self.generation_config = GenerationConfig(**generation_kwargs)
+
+        # Gradient accumulation requires scaled loss. Normally, loss scaling in the parent class depends on whether the
+        # model accepts loss-related kwargs. Since we compute our own loss, this check is irrelevant. We set
+        # self.model_accepts_loss_kwargs to False to enable scaling.
+        self.model_accepts_loss_kwargs = False
+
+        # Add tags to the model
+        self.model.add_model_tags(self._tag_names)
+
+        if self.ref_model is not None:
+            if self.is_deepspeed_enabled:
+                self.ref_model = prepare_deepspeed(self.ref_model, self.accelerator)
+            elif self.is_fsdp_enabled:
+                self.ref_model = prepare_fsdp(self.ref_model, self.accelerator)
+            else:
+                self.ref_model = self.accelerator.prepare_model(
+                    self.ref_model, evaluation_mode=True
+                )
+
+        if args.sync_ref_model:
+            self.add_callback(
+                SyncRefModelCallback(
+                    ref_model=self.ref_model, accelerator=self.accelerator
+                )
+            )
+
+        for i, reward_func in enumerate(self.reward_funcs):
+            if isinstance(reward_func, PreTrainedModel):
+                if self.is_deepspeed_enabled:
+                    self.reward_funcs[i] = prepare_deepspeed(
+                        reward_func, self.accelerator
+                    )
+                else:
+                    # set device placement to True to make `prepare_model` move `reward_func` to device when using fsdp
+                    self.reward_funcs[i] = self.accelerator.prepare_model(
+                        reward_func, evaluation_mode=True, device_placement=True
+                    )
+
+    def _set_signature_columns_if_needed(self):
+        # If `self.args.remove_unused_columns` is True, non-signature columns are removed.
+        # By default, this method sets `self._signature_columns` to the model's expected inputs.
+        # In RLOOTrainer, we preprocess data, so using the model's signature columns doesn't work.
+        # Instead, we set them to the columns expected by the `training_step` method, hence the override.
+        if self._signature_columns is None:
+            self._signature_columns = ["prompt", "image", "images"]
+
+    # This method overrides `Trainer.get_train_dataloader` to support our custom batching strategy.
+    # Instead of returning a standard per-step batch (i.e., `per_device_batch_size), our dataloader loads an
+    # *generation* batch (i.e., `per_device_batch_size × steps_per_generation`). This allows us to generate completions
+    # once every steps_per_generation step—rather than once per accumulation step—which is significantly more
+    # efficient. The only change from the original implementation is multiplying the batch size by
+    # `steps_per_generation`. Thus, `_prepare_inputs` is called with this *generation* batch, and it handles the
+    # splitting internally.
+    # Maintenance note: This method is a copy-paste of the original `Trainer.get_train_dataloader` with only one line
+    # modification. As a result, some parts of the method aren't relevant to RLOO, but we keep them to stay one line
+    # apart from the super method, ensuring easier maintenance in the future.
+    def get_train_dataloader(self):
+        if self.train_dataset is None:
+            raise ValueError("Trainer: training requires a train_dataset.")
+
+        train_dataset = self.train_dataset
+        data_collator = self.data_collator
+        if is_datasets_available() and isinstance(train_dataset, datasets.Dataset):
+            train_dataset = self._remove_unused_columns(
+                train_dataset, description="training"
+            )
+        else:
+            data_collator = self._get_collator_with_removed_columns(
+                data_collator, description="training"
+            )
+
+        dataloader_params = {
+            "batch_size": self._train_batch_size
+            * self.args.steps_per_generation,  # < this is the change
+            "collate_fn": data_collator,
+            "num_workers": self.args.dataloader_num_workers,
+            "pin_memory": self.args.dataloader_pin_memory,
+            "persistent_workers": self.args.dataloader_persistent_workers,
+        }
+
+        if not isinstance(train_dataset, torch.utils.data.IterableDataset):
+            dataloader_params["sampler"] = self._get_train_sampler()
+            dataloader_params["drop_last"] = self.args.dataloader_drop_last
+            dataloader_params["worker_init_fn"] = partial(
+                seed_worker,
+                num_workers=self.args.dataloader_num_workers,
+                rank=self.args.process_index,
+            )
+
+            dataloader_params["prefetch_factor"] = self.args.dataloader_prefetch_factor
+
+        return self.accelerator.prepare(DataLoader(train_dataset, **dataloader_params))
+
+    def _get_train_sampler(self, dataset: Dataset | None = None) -> Sampler:
+        # Returns a sampler that
+        # 1. ensures each prompt is repeated across multiple processes. This guarantees that identical prompts are
+        #    distributed to different GPUs, allowing rewards to be computed and normalized correctly within each prompt
+        #    group. Using the same seed across processes ensures consistent prompt assignment, preventing discrepancies
+        #    in group formation.
+        # 2. repeats the batch multiple times to allow reusing generations across multiple updates. Refer to
+        #    _prepare_inputs to see how the generations are stored and reused.
+
+        # In the following figure, the values are the prompt indices. The first row shows the first sampled batch, the
+        # second row shows the second sampled batch, and so on.
+        #
+        #                                      |   GPU 0  |   GPU 1  |
+        #
+        #                 global_step   step    <-───>  num_generations=2
+        #                                       <-───────> per_device_train_batch_size=3
+        #  grad_accum    ▲  ▲  0          0     0   0   1   1   2   2   <- Generate for the first `steps_per_generation` (prompts 0 to 11); store the completions; use the first slice to compute the loss
+        #     =2         ▼  |  0          1     3   3   4   4   5   5   <- Take the stored generations and use the second slice to compute the loss
+        #                   |
+        #                   |  1          2     6   6   7   7   8   8   <- Take the stored generations and use the third slice to compute the loss
+        #  steps_per_gen=4  ▼  1          3     9   9  10  10  11  11   <- Take the stored generations and use the fourth slice to compute the loss
+        #
+        #                      2          4    12  12  13  13  14  14   <- Generate for the second `steps_per_generation` (prompts 12 to 23); store the completions; use the first slice to compute the loss
+        #                      2          5    15  15  16  16  17  17   <- Take the stored generations and use the second slice to compute the loss
+        #                                          ...
+        if dataset is None:
+            dataset = self.train_dataset
+        return RepeatSampler(
+            data_source=dataset,
+            mini_repeat_count=self.num_generations,
+            batch_size=self.args.generation_batch_size // self.num_generations,
+            repeat_count=self.num_iterations * self.args.steps_per_generation,
+            shuffle=self.shuffle_dataset,
+            seed=self.args.seed,
+        )
+
+    def _get_eval_sampler(self, eval_dataset) -> Sampler:
+        # See _get_train_sampler for an explanation of the sampler.
+        return RepeatSampler(
+            data_source=eval_dataset,
+            mini_repeat_count=self.num_generations,
+            seed=self.args.seed,
+        )
+
+    @profiling_decorator
+    def _get_per_token_logps_and_entropies(
+        self,
+        model,
+        input_ids,
+        attention_mask,
+        logits_to_keep,
+        batch_size=None,
+        compute_entropy=False,
+        pixel_values=None,
+        image_grid_thw=None,
+        num_images=None,
+        pixel_attention_mask=None,
+        image_sizes=None,
+        token_type_ids=None,
+    ) -> dict[str, torch.Tensor | None]:
+        """Compute log-probs and (optionally) entropies for each token."""
+        batch_size = batch_size or input_ids.size(
+            0
+        )  # Chunk inputs into smaller batches to reduce memory peak
+        all_logps = []
+        all_entropies = []
+        for start in range(0, input_ids.size(0), batch_size):
+            input_ids_batch = input_ids[start : start + batch_size]
+            attention_mask_batch = attention_mask[start : start + batch_size]
+
+            # Build model inputs - check if the model supports logits_to_keep (some models and VLMs don't)
+            model_inputs = {
+                "input_ids": input_ids_batch,
+                "attention_mask": attention_mask_batch,
+            }
+
+            if image_grid_thw is not None and pixel_values is not None:
+                rows_per_image = image_grid_thw.prod(dim=-1)
+                rows_per_sample = torch.split(rows_per_image, num_images)
+                rows_per_sample = torch.stack([s.sum() for s in rows_per_sample])
+                cum_rows = torch.cat(
+                    [
+                        torch.tensor([0], device=rows_per_sample.device),
+                        rows_per_sample.cumsum(0),
+                    ]
+                )
+                row_start, row_end = (
+                    cum_rows[start].item(),
+                    cum_rows[start + batch_size].item(),
+                )
+                model_inputs["pixel_values"] = pixel_values[row_start:row_end]
+                cum_imgs = torch.tensor([0] + num_images).cumsum(0)
+                img_start, img_end = cum_imgs[start], cum_imgs[start + batch_size]
+                model_inputs["image_grid_thw"] = image_grid_thw[img_start:img_end]
+            elif pixel_values is not None:
+                model_inputs["pixel_values"] = pixel_values[start : start + batch_size]
+            if pixel_attention_mask is not None:
+                model_inputs["pixel_attention_mask"] = pixel_attention_mask[
+                    start : start + batch_size
+                ]
+            if image_sizes is not None:
+                model_inputs["image_sizes"] = image_sizes[start : start + batch_size]
+            if token_type_ids is not None:
+                model_inputs["token_type_ids"] = token_type_ids[
+                    start : start + batch_size
+                ]
+
+            # Only add logits_to_keep if the model supports it
+            if "logits_to_keep" in self.model_kwarg_keys:
+                # We add 1 to `logits_to_keep` because the last logits of the sequence is later excluded
+                model_inputs["logits_to_keep"] = logits_to_keep + 1
+
+            model_inputs["use_cache"] = (
+                False  # only used in generation; set False to suppress warnings
+            )
+
+            logits = model(**model_inputs).logits
+            # Exclude the last value: it corresponds to the next token pred
+            logits = logits[:, :-1, :]  # (B, L-1, H)
+            # Only keep the last logits_to_keep. For model that support logits_to_keep, this is a no-op.
+            logits = logits[:, -logits_to_keep:, :]  # (B, logits_to_keep, H)
+            # Divide logits by sampling temperature.
+            # See https://huggingface.co/blog/the_n_implementation_details_of_rlhf_with_ppo#policy-training-implementation-details
+            logits = logits / self.temperature
+
+            completion_ids = input_ids_batch[:, -logits_to_keep:]
+            logps = selective_log_softmax(logits, completion_ids)  # compute logprobs
+            all_logps.append(logps)
+
+            if compute_entropy:
+                with torch.no_grad():
+                    entropies = entropy_from_logits(logits)
+                all_entropies.append(entropies)
+
+        logps = torch.cat(all_logps, dim=0)
+        entropies = torch.cat(all_entropies, dim=0) if compute_entropy else None
+        return logps, entropies
+
+    def _fix_param_name_to_vllm(self, name, extra_prefixes: list[str] | None = None):
+        extra_prefixes = extra_prefixes or []
+        prefixes = ["_checkpoint_wrapped_module."] + extra_prefixes
+        for prefix in prefixes:
+            name = name.replace(prefix, "")
+        return name
+
+    def _sync_fsdp1_params_to_vllm(
+        self, module: nn.Module, prefix: str = "", visited=None
+    ):
+        """Memory-efficient post-order traversal of FSDP modules to extract full parameters and sync with vLLM."""
+        # For FSDP1, we need to recurse into children and also use summon_full_params
+        if visited is None:
+            visited = set()
+        for child_name, child_module in module.named_children():
+            child_prefix = f"{prefix}.{child_name}" if prefix else child_name
+            self._sync_fsdp1_params_to_vllm(
+                child_module, prefix=child_prefix, visited=visited
+            )  # recurse into the child
+
+        if isinstance(module, FSDP):
+            with FSDP.summon_full_params(module, recurse=False, writeback=False):
+                for param_name, param in module.named_parameters():
+                    full_name = f"{prefix}.{param_name}" if prefix else param_name
+                    full_name = self._fix_param_name_to_vllm(
+                        full_name, extra_prefixes=["_fsdp_wrapped_module."]
+                    )
+
+                    if full_name in visited:
+                        continue  # skip FSDP subtrees already traversed
+                    visited.add(full_name)
+
+                    if self.vllm_mode == "server" and self.accelerator.is_main_process:
+                        self.vllm_client.update_named_param(full_name, param.data)
+                    elif self.vllm_mode == "colocate":
+                        llm_model = self.llm.llm_engine.model_executor.driver_worker.model_runner.model
+                        llm_model.load_weights([(full_name, param.data)])
+
+    def _sync_fsdp2_params_to_vllm(self, module: nn.Module):
+        # For FSDP2, module.state_dict() already covers all parameters, so no need for recursion
+        for name, param in module.state_dict().items():
+            if param.is_cpu:
+                param = param.to(torch.device("cuda"))
+            param = param.full_tensor()
+
+            if self.vllm_mode == "server" and self.accelerator.is_main_process:
+                self.vllm_client.update_named_param(name, param)
+            elif self.vllm_mode == "colocate":
+                llm_model = (
+                    self.llm.llm_engine.model_executor.driver_worker.model_runner.model
+                )
+                llm_model.load_weights([(name, param)])
+
+    @profiling_decorator
+    def _move_model_to_vllm(self):
+        # For DeepSpeed ZeRO-3 and FSDP, we need to gather all parameters before operations
+        deepspeed_plugin = self.accelerator.state.deepspeed_plugin
+        zero_stage_3 = deepspeed_plugin is not None and deepspeed_plugin.zero_stage == 3
+        if zero_stage_3:
+            import deepspeed
+
+            gather_if_zero3 = deepspeed.zero.GatheredParameters
+        else:
+            gather_if_zero3 = nullcontext
+
+        if is_peft_model(self.model):
+            # With PEFT and FSDP/DeepSpeed ZeRO Stage 3, we must gather the full model at once before merging, as
+            # merging adapters in a sharded manner is not supported.
+            # TODO: does this work with FSDP?
+            with gather_if_zero3(list(self.model.parameters())):
+                self.model.merge_adapter()
+
+                # Update vLLM weights while parameters are gathered
+                if (
+                    self.is_fsdp_enabled
+                ):  # note if using FSDP, gather_if_zero3 is nullcontext
+                    # Update vLLM weights while parameters are gathered
+                    # For PEFT with FSDP we need to use the memory efficient post-order traversal
+                    fsdp_plugin = getattr(self.accelerator.state, "fsdp_plugin", None)
+                    fsdp_version = (
+                        getattr(fsdp_plugin, "fsdp_version", 1) if fsdp_plugin else 1
+                    )
+                    if fsdp_version == 1:
+                        self._sync_fsdp1_params_to_vllm(
+                            self.model
+                        )  # use memory-efficient post-order traversal for FSDP
+                    elif fsdp_version == 2:
+                        self._sync_fsdp2_params_to_vllm(self.model)
+                else:
+                    # DeepSpeed ZeRO-3 with PEFT
+                    for name, param in self.model.named_parameters():
+                        # When using PEFT, we need to recover the original parameter name and discard some parameters
+                        name = name.removeprefix("base_model.model.").replace(
+                            ".base_layer", ""
+                        )
+                        if self.model.prefix in name:
+                            continue
+                        # When module to save, remove its prefix and discard the original module
+                        if "original_module" in name:
+                            continue
+                        name = self._fix_param_name_to_vllm(
+                            name, extra_prefixes=["modules_to_save.default."]
+                        )
+
+                        if (
+                            self.vllm_mode == "server"
+                            and self.accelerator.is_main_process
+                        ):
+                            self.vllm_client.update_named_param(name, param.data)
+                        elif self.vllm_mode == "colocate":
+                            llm_model = self.llm.llm_engine.model_executor.driver_worker.model_runner.model
+                            llm_model.load_weights([(name, param.data)])
+                # Unmerge adapters while parameters are still gathered
+                self.model.unmerge_adapter()
+                # Parameters will automatically be repartitioned when exiting the context
+        # For non-PEFT models, simply gather (if needed) and update each parameter individually.
+        elif self.is_fsdp_enabled:
+            fsdp_plugin = getattr(self.accelerator.state, "fsdp_plugin", None)
+            fsdp_version = getattr(fsdp_plugin, "fsdp_version", 1) if fsdp_plugin else 1
+            if fsdp_version == 1:
+                self._sync_fsdp1_params_to_vllm(
+                    self.model
+                )  # use memory-efficient post-order traversal for FSDP
+            elif fsdp_version == 2:
+                self._sync_fsdp2_params_to_vllm(self.model)
+        else:
+            for name, param in self.model.named_parameters():
+                name = self._fix_param_name_to_vllm(name)
+                with gather_if_zero3([param]):
+                    if self.vllm_mode == "server" and self.accelerator.is_main_process:
+                        self.vllm_client.update_named_param(name, param.data)
+                    elif self.vllm_mode == "colocate":
+                        llm_model = self.llm.llm_engine.model_executor.driver_worker.model_runner.model
+                        llm_model.load_weights([(name, param.data)])
+
+        # Reset cache on vLLM
+        if self.vllm_mode == "server" and self.accelerator.is_main_process:
+            self.vllm_client.reset_prefix_cache()
+        elif self.vllm_mode == "colocate":
+            self.llm.reset_prefix_cache()
+
+    @profiling_decorator
+    def _prepare_inputs(
+        self, generation_batch: dict[str, torch.Tensor | Any]
+    ) -> dict[str, torch.Tensor | Any]:
+        # Prepares inputs for model training/evaluation by managing completion generation and batch handling.
+        # During training:
+        #   - Receives the local generation batch (Per-GPU batch size × steps per generation)
+        #     from the modified training dataloader instead of the standard local batch
+        #   - Generates completions once for the entire generation batch and splits it into batches of size
+        #     `per_device_train_batch_size`
+        #   - Buffers these completions and returns the appropriate slice for the current accumulation step
+        #   - Optimizes by regenerating completions only periodically (every steps_per_generation * num_iterations)
+        # During evaluation:
+        #   - The input is treated as a standard local batch (no accumulation, no multiple iterations)
+        #   - Completions are generated for each batch without buffering or reuse
+        # Returns a single local batch in both cases.
+
+        mode = "train" if self.model.training else "eval"
+        if mode == "train":
+            generate_every = self.args.steps_per_generation * self.num_iterations
+            if self._step % generate_every == 0 or self._buffered_inputs is None:
+                # self._buffered_inputs=None can occur when resuming from a checkpoint
+                generation_batch = self._generate_and_score_completions(
+                    generation_batch
+                )
+                generation_batch = split_pixel_values_by_grid(generation_batch)
+                generation_batch = shuffle_sequence_dict(generation_batch)
+                generation_batches = split_tensor_dict(
+                    generation_batch, self.args.steps_per_generation
+                )
+                self._buffered_inputs = [
+                    unsplit_pixel_values_by_grid(batch) for batch in generation_batches
+                ]
+            inputs = self._buffered_inputs[self._step % self.args.steps_per_generation]
+            self._step += 1
+        else:
+            # In evaluation, there is neither batch grouping for generation, nor multiple iterations, hence
+            # local generation batch == local eval batch
+            inputs = self._generate_and_score_completions(generation_batch)
+        return inputs
+
+    @profiling_decorator
+    def _calculate_rewards(self, inputs, prompts, completions, completion_ids_list):
+        device = self.accelerator.device
+        rewards_per_func = torch.zeros(
+            len(prompts), len(self.reward_funcs), device=device
+        )
+
+        # Repeat all input columns (but "prompt", "completion", and "completion_ids") to match the num of generations
+        keys = [
+            key
+            for key in inputs[0]
+            if key not in ["prompt", "completion", "completion_ids"]
+        ]
+        reward_kwargs = {key: [example[key] for example in inputs] for key in keys}
+
+        # This allows for dynamic reward shaping based on training progress.
+        reward_kwargs["trainer_state"] = self.state
+
+        for i, (reward_func, reward_processing_class, reward_func_name) in enumerate(
+            zip(
+                self.reward_funcs,
+                self.reward_processing_classes,
+                self.reward_func_names,
+                strict=True,
+            )
+        ):
+            with profiling_context(self, reward_func_name):
+                if isinstance(
+                    reward_func, nn.Module
+                ):  # Module (no PretrainedModel) for compat with compiled models
+                    if is_conversational(inputs[0]):
+                        messages = [
+                            {"messages": p + c}
+                            for p, c in zip(prompts, completions, strict=True)
+                        ]
+                        texts = [
+                            apply_chat_template(
+                                x, reward_processing_class, **self.chat_template_kwargs
+                            )["text"]
+                            for x in messages
+                        ]
+                    else:
+                        texts = [
+                            p + c for p, c in zip(prompts, completions, strict=True)
+                        ]
+                    reward_inputs = reward_processing_class(
+                        text=texts,
+                        return_tensors="pt",
+                        padding=True,
+                        padding_side="right",
+                        add_special_tokens=False,
+                    )
+                    reward_inputs = super()._prepare_inputs(reward_inputs)
+                    with torch.inference_mode():
+                        rewards_per_func[:, i] = reward_func(**reward_inputs).logits[
+                            :, 0
+                        ]  # Shape (B*G,)
+                else:
+                    output_reward_func = reward_func(
+                        prompts=prompts,
+                        completions=completions,
+                        completion_ids=completion_ids_list,
+                        **reward_kwargs,
+                    )
+                    # Convert None values to NaN
+                    output_reward_func = [
+                        reward if reward is not None else torch.nan
+                        for reward in output_reward_func
+                    ]
+
+                    rewards_per_func[:, i] = torch.tensor(
+                        output_reward_func, dtype=torch.float32, device=device
+                    )
+
+        # If all reward functions return None for a given row, issue a detailed warning
+        if torch.isnan(rewards_per_func).all(dim=1).any():
+            nan_row_idx = (
+                torch.isnan(rewards_per_func).all(dim=1).nonzero(as_tuple=True)[0][0]
+            )
+            row_reward_kwargs = {
+                key: value[nan_row_idx]
+                for key, value in reward_kwargs.items()
+                if key != "trainer_state"
+            }
+            row_reward_kwargs["prompt"] = prompts[nan_row_idx]
+            row_reward_kwargs["completion"] = completions[nan_row_idx]
+            logger.warning(
+                f"All reward functions returned None for the following kwargs:\n{row_reward_kwargs}\n"
+                "Please ensure that at least one reward function returns a valid reward."
+            )
+
+        # Gather the reward per function: this part is crucial, because the rewards are normalized per group and the
+        # completions may be distributed across processes
+        rewards_per_func = gather(rewards_per_func)
+        return rewards_per_func
+
+    def _generate_single_turn(self, prompts: list):
+        device = self.accelerator.device
+
+        # Generate completions using either vLLM or regular generation
+        if self.use_vllm:
+            if self.vllm_mode == "colocate" and self.args.vllm_enable_sleep_mode:
+                # wake up colocated vLLM instances if needed
+                torch.cuda.empty_cache()  # required to avoid OOM in some cases
+                self.llm.wake_up(tags=["weights"])
+
+            # First, update the vLLM weights if needed
+            if self.state.global_step != self._last_loaded_step:
+                self._move_model_to_vllm()
+                self._last_loaded_step = self.state.global_step
+
+            prompts = [prepare_multimodal_messages_vllm(prompt) for prompt in prompts]
+
+            # Generate completions using vLLM: gather all prompts and use them in a single call in the main process
+            if self.vllm_mode == "server":
+                all_prompts = gather_object(prompts)
+
+                if self.accelerator.is_main_process:
+                    # Since 'prompts' contains 'num_generations' duplicates, we first take unique prompts, and generate
+                    # num_generations outputs for each one. This is faster than generating outputs for each duplicate
+                    # prompt individually.
+                    ordered_set_of_prompts = all_prompts[:: self.num_generations]
+
+                    sampling_params = {
+                        "n": self.num_generations,
+                        "repetition_penalty": self.repetition_penalty,
+                        "temperature": self.temperature,
+                        "top_p": self.top_p,
+                        "top_k": -1 if self.top_k is None else self.top_k,
+                        "min_p": 0.0 if self.min_p is None else self.min_p,
+                        "max_tokens": self.max_completion_length,
+                        "truncate_prompt_tokens": self.max_prompt_length,
+                        "guided_decoding_regex": self.guided_decoding_regex,
+                        "generation_kwargs": self.args.generation_kwargs,
+                    }
+                    with profiling_context(self, "vLLM.generate"):
+                        if is_conversational({"prompt": ordered_set_of_prompts[0]}):
+                            output = self.vllm_client.chat(
+                                messages=ordered_set_of_prompts,
+                                **sampling_params,
+                                chat_template_kwargs=self.chat_template_kwargs,
+                            )
+                        else:
+                            output = self.vllm_client.generate(
+                                prompts=ordered_set_of_prompts, **sampling_params
+                            )
+                        payload = (
+                            output["prompt_ids"],
+                            output["completion_ids"],
+                            output["logprobs"],
+                        )
+                else:
+                    payload = None
+
+                # Broadcast the completions from the main process to all processes, ensuring each process receives its corresponding slice.
+                obj_list = [payload]
+                broadcast_object_list(obj_list, from_process=0)
+                all_prompt_ids, all_completion_ids, _ = obj_list[0]
+
+                # At this point, we only get 1 copy of each prompt, so we need to repeat them num_generations times
+                all_prompt_ids = [
+                    ids for ids in all_prompt_ids for _ in range(self.num_generations)
+                ]
+
+                process_slice = slice(
+                    self.accelerator.process_index * len(prompts),
+                    (self.accelerator.process_index + 1) * len(prompts),
+                )
+                prompt_ids = all_prompt_ids[process_slice]
+                completion_ids = all_completion_ids[process_slice]
+
+            # Generate completions using colocated vLLM instances: each device holds vLLM copy and work on their own batch of prompts
+            elif self.vllm_mode == "colocate":
+                if self.guided_decoding_regex:
+                    guided_decoding = GuidedDecodingParams(
+                        regex=self.guided_decoding_regex
+                    )
+                else:
+                    guided_decoding = None
+
+                generation_kwargs = {
+                    "n": 1,  # vLLM on each GPU generates only 1 in colocate mode
+                    "repetition_penalty": self.repetition_penalty,
+                    "temperature": self.temperature,
+                    "top_p": self.top_p,
+                    "top_k": -1 if self.top_k is None else self.top_k,
+                    "min_p": 0.0 if self.min_p is None else self.min_p,
+                    "max_tokens": self.max_completion_length,
+                    "truncate_prompt_tokens": self.max_prompt_length,
+                    "guided_decoding": guided_decoding,
+                }
+                if self.args.generation_kwargs is not None:
+                    generation_kwargs.update(self.args.generation_kwargs)
+                sampling_params = SamplingParams(**generation_kwargs)
+
+                if self.vllm_tensor_parallel_size > 1:
+                    # Gather prompts from all ranks in the TP group and flatten.
+                    # Each rank starts with its own prompts; after gathering, all ranks see the full group set.
+                    orig_size = len(prompts)
+                    gathered_prompts = [
+                        None for _ in range(self.vllm_tensor_parallel_size)
+                    ]
+                    torch.distributed.all_gather_object(
+                        gathered_prompts, prompts, group=self.tp_group
+                    )
+                    all_prompts = [p for sublist in gathered_prompts for p in sublist]
+                else:
+                    all_prompts = prompts
+
+                if self.args.vllm_enable_sleep_mode:
+                    self.llm.wake_up(tags=["kv_cache"])
+
+                with profiling_context(self, "vLLM.generate"):
+                    if is_conversational({"prompt": prompts[0]}):
+                        all_outputs = self.llm.chat(
+                            all_prompts, sampling_params=sampling_params, use_tqdm=False
+                        )
+                    else:
+                        all_outputs = self.llm.generate(
+                            all_prompts, sampling_params=sampling_params, use_tqdm=False
+                        )
+
+                all_prompt_ids = [output.prompt_token_ids for output in all_outputs]
+                all_completion_ids = [
+                    output.token_ids
+                    for outputs in all_outputs
+                    for output in outputs.outputs
+                ]
+
+                if self.vllm_tensor_parallel_size > 1:
+                    # Slice completions for this rank within its TP group.
+                    # Each rank generates all outputs — we keep only our share.
+                    local_rank_in_group = torch.distributed.get_rank(
+                        group=self.tp_group
+                    )
+                    tp_slice = slice(
+                        local_rank_in_group * orig_size,
+                        (local_rank_in_group + 1) * orig_size,
+                    )
+                    prompt_ids = all_prompt_ids[tp_slice]
+                    completion_ids = all_completion_ids[tp_slice]
+                else:
+                    prompt_ids = all_prompt_ids
+                    completion_ids = all_completion_ids
+
+                if self.args.vllm_enable_sleep_mode:
+                    self.llm.sleep(level=2)
+
+        elif self.use_transformers_paged:
+            processor_kwargs = {
+                "max_length": self.max_prompt_length,
+                "truncation": True,
+                "add_special_tokens": False,
+            }
+            if is_conversational({"prompt": prompts[0]}):
+                processor_outputs = self.processing_class.apply_chat_template(
+                    conversation=prompts,
+                    **processor_kwargs,
+                    add_generation_prompt=True,
+                    tokenize=True,
+                    return_dict=True,
+                    **self.chat_template_kwargs,
+                )
+            else:
+                processor_outputs = self.processing_class(
+                    text=prompts, **processor_kwargs
+                )
+
+            with (
+                profiling_context(self, "transformers.generate_batch"),
+                unwrap_model_for_generation(
+                    self.model_wrapped,
+                    self.accelerator,
+                    gather_deepspeed3_params=self.args.ds3_gather_for_generation,
+                ) as unwrapped_model,
+                torch.no_grad(),
+                FSDP.summon_full_params(self.model_wrapped, recurse=False)
+                if self.is_fsdp_enabled
+                else nullcontext(),
+            ):
+                # Cast to the appropriate dtype based on training configuration
+                if self.args.bf16:
+                    unwrapped_model.to(torch.bfloat16)
+                elif self.args.fp16:
+                    unwrapped_model.to(torch.float16)
+                with torch.inference_mode():
+                    # Continuous batching API expects 'inputs' arg only
+                    all_outputs = unwrapped_model.generate_batch(
+                        processor_outputs["input_ids"],
+                        generation_config=self.generation_config,
+                        progress_bar=False,
+                    )
+                    unwrapped_model.train()  # restore training mode, as generate_batch forces eval mode
+            completion_ids = [
+                output.generated_tokens for output in all_outputs.values()
+            ]
+            prompt_ids = processor_outputs["input_ids"]
+
+        else:
+            # Regular generation path
+            processor_kwargs = {
+                "return_tensors": "pt",
+                "padding": True,
+                "padding_side": "left",
+                "max_length": self.max_prompt_length,
+                "truncation": True,
+                "add_special_tokens": False,
+            }
+            if is_conversational({"prompt": prompts[0]}):
+                generate_inputs = self.processing_class.apply_chat_template(
+                    conversation=prompts,
+                    **processor_kwargs,
+                    add_generation_prompt=True,
+                    tokenize=True,
+                    return_dict=True,
+                    **self.chat_template_kwargs,
+                )
+            else:
+                generate_inputs = self.processing_class(
+                    text=prompts, **processor_kwargs
+                )
+            generate_inputs = super()._prepare_inputs(generate_inputs)
+
+            with (
+                profiling_context(self, "transformers.generate"),
+                unwrap_model_for_generation(
+                    self.model_wrapped,
+                    self.accelerator,
+                    gather_deepspeed3_params=self.args.ds3_gather_for_generation,
+                ) as unwrapped_model,
+                torch.no_grad(),
+                FSDP.summon_full_params(self.model_wrapped, recurse=False)
+                if self.is_fsdp_enabled
+                else nullcontext(),
+            ):
+                prompt_completion_ids = unwrapped_model.generate(
+                    **generate_inputs,
+                    generation_config=self.generation_config,
+                    disable_compile=True,
+                )
+            # Compute prompt length and extract completion ids
+            prompt_ids, prompt_mask = (
+                generate_inputs["input_ids"],
+                generate_inputs["attention_mask"],
+            )
+            prompt_length = prompt_ids.size(1)
+            completion_ids = prompt_completion_ids[:, prompt_length:]
+
+            # Mask everything after the first EOS token
+            is_eos = completion_ids == self.eos_token_id
+            eos_idx = torch.full(
+                (is_eos.size(0),), is_eos.size(1), dtype=torch.long, device=device
+            )
+            eos_idx[is_eos.any(dim=1)] = is_eos.int().argmax(dim=1)[is_eos.any(dim=1)]
+            sequence_indices = torch.arange(is_eos.size(1), device=device).expand(
+                is_eos.size(0), -1
+            )
+            completion_mask = (sequence_indices <= eos_idx.unsqueeze(1)).int()
+            prompt_ids = [
+                p[m].tolist()
+                for p, m in zip(prompt_ids, prompt_mask.bool(), strict=True)
+            ]
+            completion_ids = [
+                c[m].tolist()
+                for c, m in zip(completion_ids, completion_mask.bool(), strict=True)
+            ]
+
+        return prompt_ids, completion_ids
+
+    def _generate(self, prompts: list):
+        device = self.accelerator.device
+        mode = "train" if self.model.training else "eval"
+
+        prompt_ids, completion_ids = self._generate_single_turn(prompts)
+
+        # Get completion length per sequence, used for logging
+        prompt_lengths = torch.tensor([len(ids) for ids in prompt_ids], device=device)
+        completion_lengths = torch.tensor(
+            [len(ids) for ids in completion_ids], device=device
+        )
+        agg_prompt_lengths = self.accelerator.gather(prompt_lengths)
+        agg_completion_lengths = self.accelerator.gather(completion_lengths)
+        total_prompt_tokens = agg_prompt_lengths.sum()
+        total_completion_tokens = (
+            agg_completion_lengths.sum()
+        )  # = num_items_in_batch, required for the DAPO loss
+
+        # Log the metrics
+        if mode == "train":
+            self.state.num_input_tokens_seen += (
+                total_prompt_tokens + total_completion_tokens
+            ).item()
+        self._metrics[mode]["num_tokens"] = [self.state.num_input_tokens_seen]
+
+        # Log completion lengths, mean, min, max
+        agg_completion_lengths = self.accelerator.gather(completion_lengths)
+        self._metrics[mode]["completions/mean_length"].append(
+            agg_completion_lengths.float().mean().item()
+        )
+        self._metrics[mode]["completions/min_length"].append(
+            agg_completion_lengths.float().min().item()
+        )
+        self._metrics[mode]["completions/max_length"].append(
+            agg_completion_lengths.float().max().item()
+        )
+
+        # Identify sequences that terminated with EOS and log their lengths
+        eos_and_pad = [self.eos_token_id, self.pad_token_id]
+        is_truncated = torch.tensor(
+            [ids[-1] not in eos_and_pad for ids in completion_ids], device=device
+        )
+        agg_is_truncated = self.accelerator.gather(is_truncated)
+        self._metrics[mode]["completions/clipped_ratio"].append(
+            agg_is_truncated.float().mean().item()
+        )
+        term_completion_lengths = agg_completion_lengths[~agg_is_truncated]
+        if (
+            len(term_completion_lengths) == 0
+        ):  # edge case where no terminated sequences are found
+            term_completion_lengths = torch.zeros(1, device=device)
+        self._metrics[mode]["completions/mean_terminated_length"].append(
+            term_completion_lengths.float().mean().item()
+        )
+        self._metrics[mode]["completions/min_terminated_length"].append(
+            term_completion_lengths.float().min().item()
+        )
+        self._metrics[mode]["completions/max_terminated_length"].append(
+            term_completion_lengths.float().max().item()
+        )
+
+        return prompt_ids, completion_ids
+
+    def _generate_and_score_completions(
+        self, inputs: list[dict[str, torch.Tensor | Any]]
+    ) -> dict[str, torch.Tensor | Any]:
+        device = self.accelerator.device
+        mode = "train" if self.model.training else "eval"
+
+        prompts = [x["prompt"] for x in inputs]
+
+        if "images" in inputs[0]:
+            images = [example.get("images") for example in inputs]
+        elif "image" in inputs[0]:
+            images = [
+                [example.get("image")] if example.get("image") is not None else None
+                for example in inputs
+            ]
+        else:
+            images = None
+        # Transformers requires at least one image in the batch, otherwise it throws an error
+        if images is not None and all(img_list == [] for img_list in images):
+            images = None
+
+        # If the prompts are conversational and the inputs contain images, we need to convert the prompts from
+        # [{"role": "user", "content": "What color is the sky?"}] to
+        # [{"role": "user", "content": [{"type": "image", "image": <Image>}, {"type": "text", "text": "What color is the sky?"}]}]
+        if images is not None:
+            prompts = [
+                prepare_multimodal_messages(prompt, image_list)
+                for prompt, image_list in zip(prompts, images, strict=True)
+            ]
+
+        prompt_ids_list, completion_ids_list = self._generate(prompts)
+
+        # Convert lists of token IDs to padded tensors
+        prompt_ids = [torch.tensor(ids, device=device) for ids in prompt_ids_list]
+        prompt_mask = [torch.ones_like(ids, dtype=torch.long) for ids in prompt_ids]
+        prompt_ids = pad(
+            prompt_ids, padding_value=self.pad_token_id, padding_side="left"
+        )
+        prompt_mask = pad(prompt_mask, padding_value=0, padding_side="left")
+        completion_ids = [
+            torch.tensor(ids, device=device) for ids in completion_ids_list
+        ]
+        completion_mask = [
+            torch.ones_like(ids, dtype=torch.long) for ids in completion_ids
+        ]
+        completion_ids = pad(
+            completion_ids, padding_value=self.pad_token_id, padding_side="right"
+        )
+        completion_mask = pad(completion_mask, padding_value=0, padding_side="right")
+
+        # If mask_truncated_completions is enabled, zero out truncated completions in completion_mask
+        if self.mask_truncated_completions:
+            eos_and_pad = [self.eos_token_id, self.pad_token_id]
+            is_truncated = torch.tensor(
+                [ids[-1] not in eos_and_pad for ids in completion_ids_list],
+                device=device,
+            )
+            completion_mask = completion_mask * (~is_truncated).unsqueeze(1).int()
+
+        # Concatenate prompt_mask with completion_mask for logit computation
+        prompt_completion_ids = torch.cat(
+            [prompt_ids, completion_ids], dim=1
+        )  # (B, P+C)
+        attention_mask = torch.cat([prompt_mask, completion_mask], dim=1)  # (B, P+C)
+
+        logits_to_keep = completion_ids.size(
+            1
+        )  # we only need to compute the logits for the completion tokens
+        batch_size = (
+            self.args.per_device_train_batch_size
+            if mode == "train"
+            else self.args.per_device_eval_batch_size
+        )
+
+        num_images = (
+            [len(img_list) for img_list in images] if images is not None else None
+        )
+
+        # Get forward_kwargs for models with multimodal inputs
+        if images is not None:
+            prompts_text = [
+                apply_chat_template(
+                    {"prompt": prompt},
+                    self.processing_class,
+                    **self.chat_template_kwargs,
+                )["prompt"]
+                for prompt in prompts
+            ]
+            prompt_inputs = self.processing_class(
+                images=images, text=prompts_text, padding=True, return_tensors="pt"
+            )
+            prompt_inputs = super()._prepare_inputs(prompt_inputs)
+            forward_kwargs = {
+                k: v
+                for k, v in prompt_inputs.items()
+                if k not in ["input_ids", "attention_mask"]
+            }
+        else:
+            forward_kwargs = {}
+
+        # If token_type_ids are used, extend them with zeros for the completion part
+        if "token_type_ids" in forward_kwargs:
+            token_type_ids = forward_kwargs["token_type_ids"]
+            forward_kwargs["token_type_ids"] = torch.cat(
+                [token_type_ids, token_type_ids.new_zeros(completion_ids.shape)], dim=1
+            )
+
+        with torch.no_grad():
+            # Compute the per-token log probabilities for the current model
+            old_per_token_logps, _ = self._get_per_token_logps_and_entropies(
+                self.model,
+                prompt_completion_ids,
+                attention_mask,
+                logits_to_keep,
+                batch_size,
+                num_images=num_images,
+                **forward_kwargs,  # may contain pixel_values, image_grid_thw, pixel_attention_mask and image_sizes
+            )
+            old_logps = (old_per_token_logps * completion_mask).sum(
+                1
+            )  # mask out padding and tokens after EOS
+
+            # Compute the per-token log probabilities for the reference model
+            if self.beta != 0.0:
+                if self.ref_model is not None:
+                    ref_per_token_logps, _ = self._get_per_token_logps_and_entropies(
+                        self.ref_model,
+                        prompt_completion_ids,
+                        attention_mask,
+                        logits_to_keep,
+                        batch_size=batch_size,
+                        num_images=num_images,
+                        **forward_kwargs,  # may contain pixel_values, image_grid_thw, pixel_attention_mask and image_sizes
+                    )
+                else:
+                    with self.accelerator.unwrap_model(self.model).disable_adapter():
+                        ref_per_token_logps, _ = (
+                            self._get_per_token_logps_and_entropies(
+                                self.model,
+                                prompt_completion_ids,
+                                attention_mask,
+                                logits_to_keep,
+                                batch_size=batch_size,
+                                num_images=num_images,
+                                **forward_kwargs,  # may contain pixel_values, image_grid_thw, pixel_attention_mask and image_sizes
+                            )
+                        )
+            else:
+                ref_per_token_logps = None
+
+        # Decode
+        prompts_text = self.processing_class.batch_decode(
+            prompt_ids, skip_special_tokens=True
+        )
+        completions_text = self.processing_class.batch_decode(
+            completion_ids, skip_special_tokens=True
+        )
+        if is_conversational(inputs[0]):
+            completions = []
+            for prompt, completion in zip(prompts, completions_text, strict=True):
+                bootstrap = (
+                    prompt.pop()["content"] if prompt[-1]["role"] == "assistant" else ""
+                )
+                if isinstance(
+                    bootstrap, list
+                ):  # for VLM, the format might be [{"type": "text", "text": "..."}]
+                    assert len(bootstrap) == 1 and bootstrap[0]["type"] == "text"
+                    bootstrap = bootstrap[0]["text"]
+                completions.append(
+                    [{"role": "assistant", "content": bootstrap + completion}]
+                )
+        else:
+            completions = completions_text
+
+        # Calculate rewards for each reward function. rewards_per_func aggregates rewards across all processes. This is
+        # important because rewards will be normalized per group, and completions are distributed. We will later slice
+        # rewards_per_func to extract each process's subset.
+        rewards_per_func = self._calculate_rewards(
+            inputs, prompts, completions, completion_ids_list
+        )
+
+        # Apply weights to each reward function's output and sum
+        rewards = (
+            rewards_per_func * self.reward_weights.to(device).unsqueeze(0)
+        ).nansum(dim=1)
+
+        # Apply reward clipping if specified
+        if self.reward_clip_range:
+            rewards = rewards.clamp(
+                min=self.reward_clip_range[0], max=self.reward_clip_range[1]
+            )
+
+        # Include the KL penalty in the reward
+        if self.beta != 0.0:
+            per_token_kl = old_per_token_logps - ref_per_token_logps
+            # Apply sequence-level KL penalty to rewards (sum KL across tokens first, then apply to each sequence)
+            kl = (per_token_kl * completion_mask).sum(-1)
+            kl = gather(kl)  # rewards are gathered, so kl must be too
+            rewards = rewards - self.beta * kl
+
+        grouped_rewards = rewards.view(-1, self.num_generations)
+        mean_grouped_rewards = grouped_rewards.mean(dim=1)
+        std_rewards = grouped_rewards.std(dim=1)
+        is_std_zero = torch.isclose(std_rewards, torch.zeros_like(std_rewards))
+
+        # RLOO advantages computation
+        grouped_sum = grouped_rewards.sum(dim=1, keepdim=True)  # (num_prompts, 1)
+        baselines = (grouped_sum - grouped_rewards) / (
+            self.num_generations - 1
+        )  # (num_prompts, num_generations)
+        baselines = baselines.view(-1)  # Flatten back to match rewards shape
+        advantages = rewards - baselines
+
+        # Normalize advantages
+        if self.normalize_advantages:
+            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-4)
+
+        # Slice to keep only the local part of the data
+        process_slice = slice(
+            self.accelerator.process_index * len(prompts),
+            (self.accelerator.process_index + 1) * len(prompts),
+        )
+        all_process_advantages = (
+            advantages.clone()
+        )  # keep the aggregated advantages for logging
+        advantages = advantages[process_slice]
+
+        # Calculate and log the mean KL divergence between current and reference model
+        if self.beta != 0.0:
+            mean_kl = (
+                per_token_kl * completion_mask
+            ).sum() / completion_mask.sum().clamp(min=1.0)
+            self._metrics[mode]["kl"].append(
+                self.accelerator.gather(mean_kl).nanmean().item()
+            )
+
+        # Calculate mean reward per function, but only for samples where the function was applied (non-NaN values)
+        for i, reward_func_name in enumerate(self.reward_func_names):
+            mean_rewards = torch.nanmean(rewards_per_func[:, i]).item()
+            self._metrics[mode][f"rewards/{reward_func_name}/mean"].append(mean_rewards)
+            std_func_rewards = nanstd(rewards_per_func[:, i]).item()
+            self._metrics[mode][f"rewards/{reward_func_name}/std"].append(
+                std_func_rewards
+            )
+        self._metrics[mode]["reward"].append(mean_grouped_rewards.mean().item())
+        self._metrics[mode]["reward_std"].append(std_rewards.mean().item())
+        self._metrics[mode]["frac_reward_zero_std"].append(
+            is_std_zero.float().mean().item()
+        )
+
+        # Log prompt and completion texts
+        self._logs["prompt"].extend(gather_object(prompts_text))
+        self._logs["completion"].extend(gather_object(completions_text))
+        for i, name in enumerate(self.reward_func_names):
+            self._logs["rewards"][name].extend(rewards_per_func[:, i].tolist())
+        self._logs["advantages"].extend(all_process_advantages.tolist())
+
+        if images is not None:
+            self._logs["images"].extend(gather_object(images))
+
+        output = {
+            "prompt_ids": prompt_ids,
+            "prompt_mask": prompt_mask,
+            "completion_ids": completion_ids,
+            "completion_mask": completion_mask,
+            "old_logps": old_logps,
+            "advantages": advantages,
+        }
+        if "pixel_values" in forward_kwargs:
+            output["pixel_values"] = forward_kwargs["pixel_values"]
+        if "image_grid_thw" in forward_kwargs:
+            output["image_grid_thw"] = forward_kwargs["image_grid_thw"]
+        if "pixel_attention_mask" in forward_kwargs:
+            output["pixel_attention_mask"] = forward_kwargs["pixel_attention_mask"]
+        if "image_sizes" in forward_kwargs:
+            output["image_sizes"] = forward_kwargs["image_sizes"]
+        if "token_type_ids" in forward_kwargs:
+            output["token_type_ids"] = forward_kwargs["token_type_ids"]
+        if images is not None:
+            output["num_images"] = num_images
+        return output
+
+    @profiling_decorator
+    def compute_loss(
+        self, model, inputs, return_outputs=False, num_items_in_batch=None
+    ):
+        if return_outputs:
+            raise ValueError("The RLOOTrainer does not support returning outputs")
+        return self._compute_loss(model, inputs)
+
+    def _compute_loss(self, model, inputs):
+        # Compute the per-token log probabilities for the model
+        prompt_ids, prompt_mask = inputs["prompt_ids"], inputs["prompt_mask"]
+        completion_ids, completion_mask = (
+            inputs["completion_ids"],
+            inputs["completion_mask"],
+        )
+        input_ids = torch.cat([prompt_ids, completion_ids], dim=1)
+        attention_mask = torch.cat([prompt_mask, completion_mask], dim=1)
+        logits_to_keep = completion_ids.size(
+            1
+        )  # we only need to compute the logits for the completion tokens
+
+        # Compute the per_token_logps and the entropy at each position in the completion
+        per_token_logps, entropies = self._get_per_token_logps_and_entropies(
+            model,
+            input_ids,
+            attention_mask,
+            logits_to_keep,
+            compute_entropy=True,
+            pixel_values=inputs.get("pixel_values"),
+            image_grid_thw=inputs.get("image_grid_thw"),
+            num_images=inputs.get("num_images"),
+            pixel_attention_mask=inputs.get("pixel_attention_mask"),
+            image_sizes=inputs.get("image_sizes"),
+            token_type_ids=inputs.get("token_type_ids"),
+        )
+
+        logps = (per_token_logps * completion_mask).sum(
+            1
+        )  # mask out padding and tokens after EOS
+        old_logps = inputs["old_logps"]
+        log_ratio = logps - old_logps
+
+        # Compute the loss
+        advantages = inputs["advantages"]
+        coef_1 = torch.exp(log_ratio)
+        coef_2 = torch.clamp(coef_1, 1 - self.epsilon_low, 1 + self.epsilon_high)
+        per_sequence_loss1 = coef_1 * advantages
+        per_sequence_loss2 = coef_2 * advantages
+        per_sequence_loss = -torch.min(per_sequence_loss1, per_sequence_loss2)
+        loss = per_sequence_loss.mean()
+
+        # Log the metrics
+        mode = "train" if self.model.training else "eval"
+
+        # Entropy
+        mean_entropy = (
+            entropies * completion_mask
+        ).sum() / completion_mask.sum().clamp(min=1.0)
+        self._metrics[mode]["entropy"].append(
+            self.accelerator.gather(mean_entropy).nanmean().item()
+        )
+
+        # Compute the clipped probability ratios
+        is_low_clipped = (coef_1 < 1 - self.epsilon_low) & (advantages < 0)
+        is_high_clipped = (coef_1 > 1 + self.epsilon_high) & (advantages > 0)
+        is_region_clipped = is_low_clipped | is_high_clipped
+        gathered_low_clip = self.accelerator.gather(is_low_clipped.float().mean())
+        self._metrics[mode]["clip_ratio/low_mean"].append(
+            gathered_low_clip.nanmean().item()
+        )
+        self._metrics[mode]["clip_ratio/low_min"].append(
+            nanmin(gathered_low_clip).item()
+        )
+        gathered_high_clip = self.accelerator.gather(is_high_clipped.float().mean())
+        self._metrics[mode]["clip_ratio/high_mean"].append(
+            gathered_high_clip.nanmean().item()
+        )
+        self._metrics[mode]["clip_ratio/high_max"].append(
+            nanmax(gathered_high_clip).item()
+        )
+        gathered_clip_ratio = self.accelerator.gather(is_region_clipped.float().mean())
+        self._metrics[mode]["clip_ratio/region_mean"].append(
+            gathered_clip_ratio.nanmean().item()
+        )
+        return loss
+
+    def prediction_step(
+        self, model, inputs, prediction_loss_only, ignore_keys: list[str] | None = None
+    ):
+        inputs = self._prepare_inputs(inputs)
+        with torch.no_grad():
+            with self.compute_loss_context_manager():
+                loss = self.compute_loss(model, inputs)
+            loss = loss.mean().detach()
+        return loss, None, None
+
+    def log(self, logs: dict[str, float], start_time: float | None = None) -> None:
+        mode = "train" if self.model.training else "eval"
+        metrics = {
+            key: sum(val) / len(val) for key, val in self._metrics[mode].items()
+        }  # average the metrics
+
+        # This method can be called both in training and evaluation. When called in evaluation, the keys in `logs`
+        # start with "eval_". We need to add the prefix "eval_" to the keys in `metrics` to match the format.
+        if mode == "eval":
+            metrics = {f"eval_{key}": val for key, val in metrics.items()}
+
+        logs = {**logs, **metrics}
+        super().log(logs, start_time)
+        self._metrics[mode].clear()
+
+        if self.accelerator.is_main_process and self.log_completions:
+            if is_rich_available():
+                print_prompt_completions_sample(
+                    self._logs["prompt"],
+                    self._logs["completion"],
+                    self._logs["rewards"],
+                    self._logs["advantages"],
+                    self.state.global_step,
+                    self.num_completions_to_print,
+                )
+
+            logging_backends = []
+            if (
+                self.args.report_to
+                and "wandb" in self.args.report_to
+                and wandb.run is not None
+            ):
+                logging_backends.append(wandb)
+            if self.args.report_to and "trackio" in self.args.report_to:
+                logging_backends.append(trackio)
+
+            table = {
+                "step": [str(self.state.global_step)] * len(self._logs["prompt"]),
+                "prompt": self._logs["prompt"],
+                "completion": self._logs["completion"],
+                **self._logs["rewards"],
+                "advantage": self._logs["advantages"],
+            }
+
+            df_base = pd.DataFrame(table)
+            images_raw = self._logs["images"] or []
+
+            for logging_backend in logging_backends:
+                if images_raw:
+                    images = []
+                    for image_list in self._logs["images"]:
+                        images.append(
+                            [logging_backend.Image(image) for image in image_list]
+                        )
+                    df = pd.concat(
+                        [df_base, pd.Series(images, name="image")],
+                        axis=1,
+                        copy=False,
+                    )
+                else:
+                    df = df_base
+
+                if self.wandb_log_unique_prompts:
+                    df = df.drop_duplicates(subset=["prompt"])
+
+                logging_backend.log(
+                    {"completions": logging_backend.Table(dataframe=df)}
+                )
+
+    # Ensure the model card is saved along with the checkpoint
+    def _save_checkpoint(self, model, trial):
+        if self.args.hub_model_id is None:
+            model_name = Path(self.args.output_dir).name
+        else:
+            model_name = self.args.hub_model_id.split("/")[-1]
+        self.create_model_card(model_name=model_name)
+        super()._save_checkpoint(model, trial)
diff --git a/src/aixpert/training/training/trl/trainer/sft_config.py b/src/aixpert/training/training/trl/trainer/sft_config.py
new file mode 100644
index 0000000..f16a7f6
--- /dev/null
+++ b/src/aixpert/training/training/trl/trainer/sft_config.py
@@ -0,0 +1,267 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass, field
+from typing import Any
+
+from transformers import TrainingArguments
+
+
+@dataclass
+class SFTConfig(TrainingArguments):
+    r"""
+    Configuration class for the [`SFTTrainer`].
+
+    This class includes only the parameters that are specific to SFT training. For a full list of training arguments,
+    please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this class may
+    differ from those in [`~transformers.TrainingArguments`].
+
+    Using [`~transformers.HfArgumentParser`] we can turn this class into
+    [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
+    command line.
+
+    Parameters
+    ----------
+        > Parameters that control the model
+
+        model_init_kwargs (`dict[str, Any]`, *optional*):
+            Keyword arguments for [`~transformers.AutoModelForCausalLM.from_pretrained`], used when the `model`
+            argument of the [`SFTTrainer`] is provided as a string. If you're training a MoE architecture and want to
+            include the load balancing/auxilliary loss as a part of the final loss, remember to set
+            `output_router_logits=True` in this dictionary.
+        chat_template_path (`str`, *optional*):
+            If specified, sets the model's chat template. This can either be the path to a tokenizer (local directory
+            or Hugging Face Hub model) or a direct path to a Jinja template file. When using a Jinja file, you must
+            ensure that any special tokens referenced in the template are added to the tokenizer and that the model's
+            embedding layer is resized accordingly.
+
+        > Parameters that control the data preprocessing
+
+        dataset_text_field (`str`, *optional*, defaults to `"text"`):
+            Name of the column that contains text data in the dataset.
+        dataset_kwargs (`dict[str, Any]`, *optional*):
+            Dictionary of optional keyword arguments for the dataset preparation. The only supported key is
+            `skip_prepare_dataset`. When the model is a VLM, `skip_prepare_dataset` is automatically treated as `True`
+            regardless of the provided value, since preprocessing is done on the fly.
+        dataset_num_proc (`int`, *optional*):
+            Number of processes to use for processing the dataset.
+        eos_token (`str`, *optional*):
+            Token used to indicate the end of a turn or sequence. If `None`, it defaults to
+            `processing_class.eos_token`.
+        pad_token (`str`, *optional*):
+            Token used for padding. If `None`, it defaults to `processing_class.pad_token`, or if that is also `None`,
+            it falls back to `processing_class.eos_token`.
+        max_length (`int` or `None`, *optional*, defaults to `1024`):
+            Maximum length of the tokenized sequence. Sequences longer than `max_length` are truncated from the right.
+            If `None`, no truncation is applied. When packing is enabled, this value sets the sequence length.
+        packing (`bool`, *optional*, defaults to `False`):
+            Whether to group multiple sequences into fixed-length blocks to improve computational efficiency and reduce
+            padding. Uses `max_length` to define sequence length.
+        packing_strategy (`str`, *optional*, defaults to `"bfd"`):
+            Strategy for packing sequences. Can be either `"bfd"` (best-fit decreasing, default), or `"wrapped"`.
+        padding_free (`bool`, *optional*, defaults to `False`):
+            Whether to perform forward passes without padding by flattening all sequences in the batch into a single
+            continuous sequence. This reduces memory usage by eliminating padding overhead. Currently, this is only
+            supported with the FlashAttention 2 or 3, which can efficiently handle the flattened batch structure. When
+            packing is enabled with strategy `"bfd"`, padding-free is enabled, regardless of the value of this
+            parameter.
+        pad_to_multiple_of (`int`, *optional*):
+            If set, the sequences will be padded to a multiple of this value.
+        eval_packing (`bool`, *optional*):
+            Whether to pack the eval dataset. If `None`, uses the same value as `packing`.
+
+        > Parameters that control the training
+
+        completion_only_loss (`bool`, *optional*):
+            Whether to compute loss only on the completion part of the sequence. If set to `True`, loss is computed
+            only on the completion, which is supported only for [prompt-completion](#prompt-completion) datasets. If
+            `False`, loss is computed on the entire sequence. If `None` (default), the behavior depends on the dataset:
+            loss is computed on the completion for [prompt-completion](#prompt-completion) datasets, and on the full
+            sequence for [language modeling](#language-modeling) datasets.
+        assistant_only_loss (`bool`, *optional*, defaults to `False`):
+            Whether to compute loss only on the assistant part of the sequence. If set to `True`, loss is computed only
+            on the assistant responses, which is supported only for [conversational](#conversational) datasets. If
+            `False`, loss is computed on the entire sequence.
+        loss_type (`str`, *optional*, defaults to `"nll"`):
+            Type of loss to use. Possible values are `"nll"` (negative log-likelihood, default) and `"dft"` (Dynamic
+            Fine-Tuning, as described in [this paper](https://huggingface.co/papers/2508.05629)).
+        activation_offloading (`bool`, *optional*, defaults to `False`):
+            Whether to offload the activations to the CPU.
+    """
+
+    _VALID_DICT_FIELDS = TrainingArguments._VALID_DICT_FIELDS + ["model_init_kwargs"]
+
+    # Parameters whose default values are overridden from TrainingArguments
+    learning_rate: float = field(
+        default=2e-5,
+        metadata={"help": "The initial learning rate for AdamW."},
+    )
+    logging_steps: float = field(
+        default=10,
+        metadata={
+            "help": "Log every X updates steps. Should be an integer or a float in range `[0,1)`. If smaller than 1, "
+            "will be interpreted as ratio of total training steps."
+        },
+    )
+    gradient_checkpointing: bool = field(
+        default=True,
+        metadata={
+            "help": "If True, use gradient checkpointing to save memory at the expense of slower backward pass."
+        },
+    )
+    bf16: bool | None = field(
+        default=None,
+        metadata={
+            "help": "Whether to use bf16 (mixed) precision instead of 32-bit. Requires Ampere or higher NVIDIA "
+            "architecture or Intel XPU or using CPU (use_cpu) or Ascend NPU. If not set, it defaults to `True` if "
+            "`fp16` is not set."
+        },
+    )
+
+    # Parameters that control the model
+    model_init_kwargs: dict[str, Any] | None = field(
+        default=None,
+        metadata={
+            "help": "Keyword arguments for `AutoModelForCausalLM.from_pretrained`, used when the `model` argument of "
+            "the `SFTTrainer` is provided as a string. If you're training a MoE architecture and want to include the "
+            "load balancing/auxilliary loss as a part of the final loss, remember to set `output_router_logits=True` "
+            "in this dictionary."
+        },
+    )
+    chat_template_path: str | None = field(
+        default=None,
+        metadata={
+            "help": "If specified, sets the model's chat template. This can either be the path to a tokenizer (local "
+            "directory or Hugging Face Hub model) or a direct path to a Jinja template file. When using a Jinja file, "
+            "you must ensure that any special tokens referenced in the template are added to the tokenizer and "
+            "that the model's embedding layer is resized accordingly."
+        },
+    )
+
+    # Parameters that control the data preprocessing
+    dataset_text_field: str = field(
+        default="text",
+        metadata={"help": "Name of the column that contains text data in the dataset."},
+    )
+    dataset_kwargs: dict[str, Any] | None = field(
+        default=None,
+        metadata={
+            "help": "Dictionary of optional keyword arguments for the dataset preparation. The only supported key is "
+            "`skip_prepare_dataset`. If the model is a VLM, `skip_prepare_dataset` value is ignored. When the model "
+            "is a VLM, `skip_prepare_dataset` is automatically treated as `True` regardless of the provided value, "
+            "since preprocessing is done on the fly."
+        },
+    )
+    dataset_num_proc: int | None = field(
+        default=None,
+        metadata={"help": "Number of processes to use for processing the dataset."},
+    )
+    eos_token: str | None = field(
+        default=None,
+        metadata={
+            "help": "Token used to indicate the end of a turn or sequence. If `None`, it defaults to `processing_class.eos_token`."
+        },
+    )
+    pad_token: str | None = field(
+        default=None,
+        metadata={
+            "help": "Token used for padding. If `None`, it defaults to `processing_class.pad_token`, or if that "
+            "is also `None`, it falls back to `processing_class.eos_token`."
+        },
+    )
+    max_length: int | None = field(
+        default=1024,
+        metadata={
+            "help": "Maximum length of the tokenized sequence. Sequences longer than `max_length` are truncated from"
+            "the right. If `None`, no truncation is applied. When packing is enabled, this value sets the "
+            "sequence length."
+        },
+    )
+    packing: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to group multiple sequences into fixed-length blocks to improve computational efficiency "
+            "and reduce padding. Uses `max_length` to define sequence length."
+        },
+    )
+    packing_strategy: str = field(
+        default="bfd",
+        metadata={
+            "help": "Strategy for packing sequences. Can be either `'bfd'` (best-fit decreasing, default), or "
+            "`'wrapped'`."
+        },
+    )
+    padding_free: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to perform forward passes without padding by flattening all sequences in the batch into "
+            "a single continuous sequence. This reduces memory usage by eliminating padding overhead. Currently, this "
+            "is only supported with the FlashAttention 2 or 3, which can efficiently handle the flattened batch "
+            "structure. When packing is enabled with strategy `'bfd'`, padding-free is enabled, regardless of the "
+            "value of this parameter."
+        },
+    )
+    pad_to_multiple_of: int | None = field(
+        default=None,
+        metadata={
+            "help": "If set, the sequences will be padded to a multiple of this value."
+        },
+    )
+    eval_packing: bool | None = field(
+        default=None,
+        metadata={
+            "help": "Whether to pack the eval dataset. If `None`, uses the same value as `packing`."
+        },
+    )
+
+    # Parameters that control the training
+    completion_only_loss: bool | None = field(
+        default=None,
+        metadata={
+            "help": (
+                "Whether to compute loss only on the completion part of the sequence. If set to `True`, loss is "
+                "computed only on the completion, which is supported only for prompt-completion datasets. If `False`, "
+                "loss is computed on the entire sequence. If `None` (default), the behavior depends on the dataset: "
+                "loss is computed on the completion for prompt-completion datasets, and on the full sequence for "
+                "language modeling datasets."
+            )
+        },
+    )
+    assistant_only_loss: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "Whether to compute loss only on the assistant part of the sequence. If set to `True`, loss is "
+                "computed only on the assistant responses, which is supported only for conversational datasets. If `False`, "
+                "loss is computed on the entire sequence."
+            )
+        },
+    )
+    loss_type: str = field(
+        default="nll",
+        metadata={
+            "help": (
+                'Type of loss to use. Possible values are `"nll"` (negative log-likelihood, default) and `"dft"` '
+                "(Dynamic Fine-Tuning, as described in https://huggingface.co/papers/2508.05629)."
+            )
+        },
+    )
+    activation_offloading: bool = field(
+        default=False,
+        metadata={"help": "Whether to offload the activations to the CPU."},
+    )
+
+    def __post_init__(self):
+        self.bf16 = not (self.fp16) if self.bf16 is None else self.bf16
+        super().__post_init__()
diff --git a/src/aixpert/training/training/trl/trainer/sft_trainer.py b/src/aixpert/training/training/trl/trainer/sft_trainer.py
new file mode 100644
index 0000000..0782827
--- /dev/null
+++ b/src/aixpert/training/training/trl/trainer/sft_trainer.py
@@ -0,0 +1,1435 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import contextlib
+import os
+from collections import defaultdict
+from collections.abc import Callable
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+import torch
+from accelerate import PartialState, logging
+from datasets import Dataset, IterableDataset
+from torch import nn
+from transformers import (
+    AutoProcessor,
+    BaseImageProcessor,
+    DataCollator,
+    FeatureExtractionMixin,
+    PreTrainedModel,
+    PreTrainedTokenizerBase,
+    ProcessorMixin,
+    TrainingArguments,
+)
+from transformers.data.data_collator import DataCollatorMixin
+from transformers.trainer_callback import TrainerCallback
+from transformers.trainer_utils import EvalPrediction
+from transformers.utils import is_peft_available
+
+from ..data_utils import (
+    apply_chat_template,
+    is_conversational,
+    is_conversational_from_value,
+    maybe_convert_to_chatml,
+    pack_dataset,
+    prepare_multimodal_messages,
+    truncate_dataset,
+)
+from ..models import (
+    clone_chat_template,
+    get_act_offloading_ctx_manager,
+    prepare_peft_model,
+)
+from .base_trainer import BaseTrainer
+from .sft_config import SFTConfig
+from .utils import (
+    create_model_from_path,
+    entropy_from_logits,
+    flush_left,
+    get_config_model_id,
+    pad,
+    remove_none_values,
+    selective_log_softmax,
+)
+
+
+if is_peft_available():
+    from peft import PeftConfig, PeftModel, PeftType
+
+
+logger = logging.get_logger(__name__)
+
+
+FLASH_ATTENTION_VARIANTS = {
+    "flash_attention_2",
+    "flash_attention_3",
+    "kernels-community/flash-attn",
+    "kernels-community/vllm-flash-attn3",
+    "kernels-community/flash-attn3",
+}
+
+
+def get_dataset_column_names(dataset: Dataset | IterableDataset) -> list[str]:
+    return (
+        list(next(iter(dataset)).keys())
+        if dataset.column_names is None
+        else dataset.column_names
+    )
+
+
+@dataclass
+class DataCollatorForLanguageModeling(DataCollatorMixin):
+    """
+    Data collator used for language modeling data. Inputs are dynamically padded to the maximum length of a batch.
+
+    This collator expects each example in the input list to be a dictionary containing at least the `"input_ids"` key.
+    If the input contains a `"completion_mask"`, it is used to set the labels to `-100` for tokens that are not in the
+    completion. If `"assistant_masks"` are present, they are used to set the labels to `-100` for tokens that are not
+    in the assistant part of the sequence. The collator returns a dictionary containing the following keys:
+    - `"input_ids"`: Tensor of input IDs, padded to the maximum length of the batch.
+    - `"labels"`: Tensor of labels, padded to the maximum length of the batch. If `completion_only_loss` is set to
+    `True`, tokens that are not in the completion are set to -100. If `assistant_masks` are present, tokens that are
+    not in the assistant part of the sequence are set to -100. If `padding_free` is set to `False`, the following key
+    is also returned:
+    - `"attention_mask"`: Tensor of attention masks, padded to the maximum length of the batch.
+    If `padding_free` is set to `True`, the following key is also returned:
+    - `"position_ids"`: Tensor of position IDs, padded to the maximum length of the batch.
+
+    Args:
+        pad_token_id (`int`):
+            Token ID to use for padding.
+        completion_only_loss (`bool`, *optional*, defaults to `True`):
+            When the input contains a completion mask (`completion_mask`), the labels are set to -100 for the tokens
+            that are no in the completion.
+        padding_free (`bool`, *optional*, defaults to `False`):
+            If set to `True`, the sequences will be flattened into a single sequence, and the position IDs will be
+            generated accordingly and returned instead of the attention mask.
+        pad_to_multiple_of (`int`, *optional*):
+            If set, the sequences will be padded to a multiple of this value.
+        return_tensors (`str`, *optional*, defaults to `"pt"`):
+            Type of Tensor to return. Only `"pt"` is currently supported.
+
+    Examples
+    --------
+    ```python
+    >>> from trl.trainer.sft_trainer import DataCollatorForLanguageModeling
+
+    >>> collator = DataCollatorForLanguageModeling(pad_token_id=0)
+    >>> examples = [{"input_ids": [1, 2, 3]}, {"input_ids": [4, 5]}]
+    >>> collator(examples)
+    {'input_ids': tensor([[  1,  2,  3],
+                          [  4,  5,  0]]),
+     'attention_mask': tensor([[  1,  1,  1],
+                               [  1,  1,  0]]),
+     'labels': tensor([[   1,    2,    3],
+                       [   4,    5, -100]])}
+
+    >>> # With completion mask
+    >>> examples = [
+    ...     {"input_ids": [1, 2, 3], "completion_mask": [0, 1, 1]},
+    ...     {"input_ids": [4, 5], "completion_mask": [0, 1]},
+    ... ]
+    >>> collator(examples)
+    {'input_ids': tensor([[  1,  2,  3],
+                          [  4,  5,  0]]),
+     'attention_mask': tensor([[  1,  1,  1],
+                               [  1,  1,  0]]),
+     'labels': tensor([[-100,    2,    3],
+                       [-100,    5, -100]])}
+
+    >>> # With padding_free
+    >>> collator = DataCollatorForLanguageModeling(pad_token_id=0, padding_free=True)
+    >>> collator(examples)
+    {'input_ids': tensor([[ 1, 2, 3, 4, 5]]),
+     'position_ids': tensor([[0, 1, 2, 0, 1]]),
+     'labels': tensor([[1, 2, 3, 4, 5]])}
+    ```
+    """
+
+    pad_token_id: int
+    completion_only_loss: bool = True
+    padding_free: bool = False
+    pad_to_multiple_of: int | None = None
+    return_tensors: str = "pt"
+
+    def torch_call(self, examples: list[dict[str, Any]]) -> dict[str, Any]:
+        # Convert to tensor
+        input_ids = [torch.tensor(example["input_ids"]) for example in examples]
+        if "labels" in examples[0]:
+            labels = [torch.tensor(example["labels"]) for example in examples]
+        else:
+            labels = [torch.tensor(example["input_ids"]) for example in examples]
+
+        # For padding-free, we should NOT create attention_mask as it causes FlashAttention to ignore position_ids and
+        # compute wrong cu_seq_lens from the all-1s mask
+        if self.padding_free:
+            if "seq_lengths" in examples[0]:
+                position_ids = self.get_position_ids_from_packed_seq_lengths(
+                    [example["seq_lengths"] for example in examples]
+                )
+            else:
+                position_ids = [torch.arange(len(ids)) for ids in input_ids]
+        else:
+            attention_mask = [torch.ones_like(ids) for ids in input_ids]
+        if self.completion_only_loss and "completion_mask" in examples[0]:
+            completion_mask = [
+                torch.tensor(example["completion_mask"]) for example in examples
+            ]
+        if "assistant_masks" in examples[0]:
+            assistant_masks = [
+                torch.tensor(example["assistant_masks"]) for example in examples
+            ]
+
+        # If padding_free, flatten everything into a single sequence
+        output = {}
+        if self.padding_free:
+            input_ids = [torch.cat(input_ids, dim=0)]
+            labels = [torch.cat(labels, dim=0)]
+            position_ids = [torch.cat(position_ids, dim=0)]
+            if self.completion_only_loss and "completion_mask" in examples[0]:
+                completion_mask = [torch.cat(completion_mask, dim=0)]
+            if "assistant_masks" in examples[0]:
+                assistant_masks = [torch.cat(assistant_masks, dim=0)]
+
+        # Pad
+        output["input_ids"] = pad(
+            input_ids,
+            padding_value=self.pad_token_id,
+            padding_side="right",
+            pad_to_multiple_of=self.pad_to_multiple_of,
+        )
+        output["labels"] = pad(
+            labels,
+            padding_value=-100,
+            padding_side="right",
+            pad_to_multiple_of=self.pad_to_multiple_of,
+        )
+        if self.padding_free:
+            output["position_ids"] = pad(
+                position_ids,
+                padding_value=0,
+                padding_side="right",
+                pad_to_multiple_of=self.pad_to_multiple_of,
+            )
+            output["labels"][output["position_ids"] == 0] = -100
+        else:
+            output["attention_mask"] = pad(
+                attention_mask,
+                padding_value=0,
+                padding_side="right",
+                pad_to_multiple_of=self.pad_to_multiple_of,
+            )
+        if self.completion_only_loss and "completion_mask" in examples[0]:
+            completion_mask = pad(
+                completion_mask,
+                padding_value=0,
+                padding_side="right",
+                pad_to_multiple_of=self.pad_to_multiple_of,
+            )
+            output["labels"][
+                completion_mask == 0
+            ] = -100  # mask everything that is not in the completion
+        if "assistant_masks" in examples[0]:
+            assistant_masks = pad(
+                assistant_masks,
+                padding_value=0,
+                padding_side="right",
+                pad_to_multiple_of=self.pad_to_multiple_of,
+            )
+            output["labels"][assistant_masks == 0] = -100
+        return output
+
+    @staticmethod
+    def get_position_ids_from_packed_seq_lengths(
+        batch_seq_lengths: list[list[int]],
+    ) -> list[torch.Tensor]:
+        """
+        Get position IDs for packed sequences.
+
+        Args:
+            batch_seq_lengths (`list[list[int]]`):
+                A list of lists containing the lengths of each individual document in the packed batch.
+
+        Return:
+            `list[torch.Tensor]`:
+                A list of tensors containing the position IDs for each packed sequence.
+        """
+        # Get lengths per row
+        example_lengths = [sum(seq_lengths) for seq_lengths in batch_seq_lengths]
+        # Flat list of lengths
+        batch_seq_lengths = torch.tensor(
+            [
+                seq_length
+                for seq_lengths in batch_seq_lengths
+                for seq_length in seq_lengths
+            ]
+        )
+        position_ids = torch.ones(sum(example_lengths), dtype=batch_seq_lengths.dtype)
+        position_ids[0] = 0
+        # Reset position ids to 0 at the start of each sequence
+        position_ids[batch_seq_lengths[:-1].cumsum(0)] = -(batch_seq_lengths[:-1] - 1)
+        position_ids = position_ids.cumsum(0)
+        # Split back into one tensor per example
+        return list(position_ids.split(example_lengths))
+
+
+@dataclass
+class DataCollatorForVisionLanguageModeling(DataCollatorMixin):
+    """
+    Data collator for vision-language modeling tasks.
+
+    Unlike text-only datasets—where the collator typically receives pre-tokenized inputs ready for batching,
+    vision-language data processing involves converting images into pixel values. This conversion is disk-intensive,
+    making upfront preprocessing of the entire dataset impractical. Therefore, this collator performs tokenization and
+    image processing on-the-fly to efficiently prepare batches.
+
+    Each input example should be a dictionary containing at least:
+    - An `"images"` key holding the image data.
+    - [language modeling](#language-modeling) type: either a `"messages"` key for conversational inputs or a `"text"`
+      key for standard text inputs.
+    - [prompt-completion](#prompt-completion) type: keys `"prompt"` and `"completion"` for the prompt and completion.
+
+    The collator outputs a dictionary including:
+    - `"input_ids"`: Tensor of token IDs.
+    - `"attention_mask"`: Tensor indicating attention mask.
+    - `"pixel_values"`: Tensor representing image pixel values.
+    - `"labels"`: Tensor for training labels.
+
+    Additional keys may be present depending on the processor, such as `"image_grid_thw"`.
+
+    Args:
+        processor ([`~transformers.ProcessorMixin`]):
+            The processor used to tokenize text and process images. It must be a subclass of
+            [`~transformers.ProcessorMixin`] and include a `tokenizer` with a defined `pad_token_id`.
+        max_length (`int` or `None`, optional, defaults to `None`):
+            Maximum sequence length for input tokens. If `None`, no truncation is applied.
+        completion_only_loss (`bool`, *optional*, defaults to `False`):
+            Whether to compute loss only on the completion part of the sequence. When `True`, the labels for the prompt
+            part are set to -100. It requires the dataset type to be prompt-completion.
+        pad_to_multiple_of (`int` or `None`, optional, defaults to `None`):
+            If set, the sequences will be padded to a multiple of this value.
+        dataset_text_field (`str`, optional, defaults to `"text"`):
+            Name of the column that contains text data in the dataset. This parameter is only relevant for [standard
+            datasets format](dataset_formats#standard).
+        return_tensors (`str`, optional, defaults to `"pt"`):
+            The tensor type to return. Currently, only `"pt"` (PyTorch tensors) is supported.
+
+    Example:
+    ```python
+    >>> from trl.trainer.sft_trainer import DataCollatorForVisionLanguageModeling
+    >>> from transformers import AutoProcessor
+
+    >>> processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
+    >>> collator = DataCollatorForVisionLanguageModeling(processor)
+    >>> examples = [
+    ...     {"images": [Image.open("image_0.png")], "messages": [{"role": "user", "content": "What is this?"}]},
+    ...     {"images": [Image.open("image_1.png")], "messages": [{"role": "user", "content": "Describe this image."}]},
+    ... ]
+    >>> collator(examples)
+    {'input_ids': tensor([[151644,   8948,    198,   2610,    525,    264,  10950,  17847,     13,  151645,    198,
+                           151644,    872,    198, 151652, 151655, 151655, 151655,  151655, 151653,   3838,    374,
+                              419,     30, 151645,    198],
+                          [151644,   8948,    198,   2610,    525,    264,  10950,  17847,     13,  151645,    198,
+                           151644,    872,    198, 151652, 151655, 151655, 151655,  151655, 151653,  74785,    419,
+                             2168,     13, 151645,    198]]),
+     'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+                               [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
+     'pixel_values': tensor([[-0.9893,  0.1785,  1.5362,  ..., -0.0582,  0.8661, -0.2431],
+                             [-0.2302,  0.9522, -1.1061,  ...,  0.0555,  1.3354, -0.6412],
+                             [ 1.2150,  0.9084,  0.7041,  ...,  0.2404, -0.8403, -0.5133],
+                             ...,
+                             [ 0.6895,  0.2807,  0.2515,  ..., -0.2004, -1.2100,  0.0555],
+                             [ 0.8209, -0.9748,  1.5654,  ...,  1.6055, -0.4706,  0.5817],
+                             [-1.0915,  0.4559,  0.9230,  ...,  0.5106,  0.0982, -0.1720]]),
+     'image_grid_thw': tensor([[1, 4, 4],
+                               [1, 4, 4]]),
+     'labels': tensor([[151644,   8948,    198,   2610,    525,    264,  10950,  17847,     13,  151645,    198,
+                        151644,    872,    198, 151652, 151655, 151655, 151655,  151655, 151653,   3838,    374,
+                           419,     30, 151645,    198],
+                        [151644,   8948,    198,   2610,    525,    264,  10950,  17847,     13,  151645,    198,
+                         151644,    872,    198, 151652, 151655, 151655, 151655,  151655, 151653,  74785,    419,
+                           2168,     13, 151645,    198]])}
+    ```
+    """
+
+    processor: ProcessorMixin
+    max_length: int | None = None
+    completion_only_loss: bool = False  # default not used in practice; SFTTrainer always passes the relevant value
+    pad_to_multiple_of: int | None = None
+    dataset_text_field: str = "text"
+    return_tensors: str = "pt"
+
+    def torch_call(self, examples: list[dict[str, Any]]) -> dict[str, Any]:
+        if "messages" in examples[0] or self.dataset_text_field in examples[0]:
+            if self.completion_only_loss:
+                raise ValueError(
+                    "The `completion_only_loss` argument is not supported for language modeling datasets."
+                )
+            return self._collate_language_modeling(examples)
+        if "prompt" in examples[0] and "completion" in examples[0]:
+            return self._collate_prompt_completion(examples)
+        raise KeyError(
+            f"Unexpected input keys in examples: {list(examples[0].keys())}."
+        )
+
+    def _collate_language_modeling(
+        self, examples: list[dict[str, Any]]
+    ) -> dict[str, Any]:
+        images = [example["images"] for example in examples]
+        # Transformers requires at least one image in the batch, otherwise it throws an error
+        if all(img_list == [] for img_list in images):
+            images = None
+
+        if "messages" in examples[0]:  # conversational case
+            messages = [
+                prepare_multimodal_messages(example["messages"], example["images"])
+                for example in examples
+            ]
+            texts = self.processor.apply_chat_template(messages)
+        elif self.dataset_text_field in examples[0]:  # standard case
+            texts = [example[self.dataset_text_field] for example in examples]
+        else:
+            raise KeyError(
+                "The input examples must contain either 'messages' for conversational data or 'text' for standard "
+                "data."
+            )
+
+        output = self.processor(
+            images=images,
+            text=texts,
+            padding=True,
+            padding_side="right",
+            pad_to_multiple_of=self.pad_to_multiple_of,
+            truncation=self.max_length is not None,
+            max_length=self.max_length,
+            return_tensors=self.return_tensors,
+            add_special_tokens=False,  # to avoid adding the BOS, twice see https://huggingface.co/blog/qgallouedec/gotchas-in-tokenizer-behavior#7-chat-template-and-tokenization-dont-compose-due-to-special-tokens
+        )
+        labels = output["input_ids"].clone()
+        labels[output["attention_mask"] == 0] = -100
+        # We mask only padding tokens (-100) in the labels. Vision tokens are left unchanged because their handling in
+        # loss computation has to be done by the model, and masking them here would be infeasible in practice as vision
+        # token definitions vary across architectures.
+        output["labels"] = labels
+        return output
+
+    def _collate_prompt_completion(
+        self, examples: list[dict[str, Any]]
+    ) -> dict[str, Any]:
+        if self.pad_to_multiple_of is not None:
+            raise NotImplementedError(
+                "Padding to a multiple of a value is not yet implemented for vision-language modeling and "
+                "prompt-completion data yet."
+            )
+        images = [example["images"] for example in examples]
+        # Transformers requires at least one image in the batch, otherwise it throws an error
+        if all(img_list == [] for img_list in images):
+            images = None
+        if is_conversational(examples[0]):  # conversational case
+            for example in examples:
+                example["prompt"] = prepare_multimodal_messages(
+                    example["prompt"], images=example["images"]
+                )
+                example["completion"] = prepare_multimodal_messages(
+                    example["completion"], images=[]
+                )
+            examples = [
+                apply_chat_template(example, self.processor) for example in examples
+            ]
+
+        prompts = [example["prompt"] for example in examples]
+        completions = [example["completion"] for example in examples]
+
+        processed_prompts = self.processor(
+            images=images,
+            text=prompts,
+            padding=True,
+            padding_side="left",
+            return_tensors=self.return_tensors,
+            add_special_tokens=False,  # to avoid adding the BOS, twice see https://huggingface.co/blog/qgallouedec/gotchas-in-tokenizer-behavior#7-chat-template-and-tokenization-dont-compose-due-to-special-tokens
+        )
+        processed_completions = self.processor(
+            text=completions,
+            padding=True,
+            padding_side="right",
+            return_tensors=self.return_tensors,
+            add_special_tokens=False,  # to avoid adding the BOS, twice see https://huggingface.co/blog/qgallouedec/gotchas-in-tokenizer-behavior#7-chat-template-and-tokenization-dont-compose-due-to-special-tokens
+        )
+
+        # Concatenate prompts and completions
+        prompt_ids, completion_ids = (
+            processed_prompts["input_ids"],
+            processed_completions["input_ids"],
+        )
+        prompt_mask, completion_mask = (
+            processed_prompts["attention_mask"],
+            processed_completions["attention_mask"],
+        )
+        input_ids = torch.cat((prompt_ids, completion_ids), dim=1)
+        attention_mask = torch.cat((prompt_mask, completion_mask), dim=1)
+        completion_mask = torch.cat(
+            (torch.zeros_like(prompt_mask), completion_mask), dim=1
+        )
+        if "token_type_ids" in processed_prompts:  # special case for Gemma
+            prompt_token_type_ids = processed_prompts["token_type_ids"]
+            completion_token_type_ids = processed_completions["token_type_ids"]
+            token_type_ids = torch.cat(
+                (prompt_token_type_ids, completion_token_type_ids), dim=1
+            )
+
+        # Flush left to reduce padding
+        if "token_type_ids" in processed_prompts:
+            attention_mask, input_ids, completion_mask, token_type_ids = flush_left(
+                attention_mask, input_ids, completion_mask, token_type_ids
+            )
+        else:
+            attention_mask, input_ids, completion_mask = flush_left(
+                attention_mask, input_ids, completion_mask
+            )
+
+        # Truncate if necessary
+        if self.max_length is not None:
+            input_ids = input_ids[:, : self.max_length]
+            attention_mask = attention_mask[:, : self.max_length]
+            completion_mask = completion_mask[:, : self.max_length]
+            if "token_type_ids" in processed_prompts:
+                token_type_ids = token_type_ids[:, : self.max_length]
+
+        # Create labels and mask padding tokens
+        labels = input_ids.clone()
+        labels[attention_mask == 0] = -100
+        if self.completion_only_loss:
+            labels[completion_mask == 0] = -100
+
+        # Build the output dictionary
+        output = processed_prompts  # we take processed_prompts because it contains the images
+        output["input_ids"] = input_ids
+        output["attention_mask"] = attention_mask
+        output["labels"] = labels
+        if "token_type_ids" in processed_prompts:
+            output["token_type_ids"] = token_type_ids
+        return output
+
+
+def dft_loss(outputs, labels, num_items_in_batch=None):
+    """
+    DFT loss function, as presented in [On the Generalization of SFT: A Reinforcement Learning Perspective with Reward
+    Rectification](https://huggingface.co/papers/2508.05629)
+    """
+    labels = nn.functional.pad(labels, (0, 1), value=-100)
+    shift_labels = labels[..., 1:].contiguous()
+    loss_mask = shift_labels != -100
+    shift_labels[~loss_mask] = 0
+    logprobs = selective_log_softmax(outputs.logits, shift_labels)
+    per_token_loss = -logprobs.exp().detach() * logprobs
+    if num_items_in_batch is None:
+        num_items_in_batch = loss_mask.sum()
+    loss = (per_token_loss * loss_mask).sum() / num_items_in_batch
+    return loss
+
+
+class SFTTrainer(BaseTrainer):
+    """
+    Trainer for Supervised Fine-Tuning (SFT) method.
+
+    This class is a wrapper around the [`~transformers.Trainer`] class and inherits all of its attributes and methods.
+
+    Example:
+
+    ```python
+    from datasets import load_dataset
+    from trl import SFTTrainer
+
+    dataset = load_dataset("roneneldan/TinyStories", split="train[:1%]")
+
+    trainer = SFTTrainer(model="Qwen/Qwen2-0.5B-Instruct", train_dataset=dataset)
+    trainer.train()
+    ```
+
+    Args:
+        model (`str | PreTrainedModel`):
+            Model to be trained. Can be either:
+
+            - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or a
+              path to a *directory* containing model weights saved using
+              [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded
+              using `<ModelArchitecture>.from_pretrained` (where `<ModelArchitecture>` is derived from the model
+              config) with the keyword arguments in `args.model_init_kwargs`.
+            - A [`~transformers.PreTrainedModel`] object.
+            If you're training a model with an MoE architecture and want to include the load balancing/auxilliary loss
+            as a part of the final loss, remember to set the `output_router_logits` config of the model to `True`.
+        args ([`SFTConfig`], *optional*):
+            Configuration for this trainer. If `None`, a default configuration is used.
+        data_collator ([`~transformers.DataCollator`], *optional*):
+            Function to use to form a batch from a list of elements of the processed `train_dataset` or `eval_dataset`.
+            Will default to [`~trainer.sft_trainer.DataCollatorForLanguageModeling`] if the model is a language model
+            and [`~trainer.sft_trainer.DataCollatorForVisionLanguageModeling`] if the model is a vision-language model.
+        train_dataset ([`~datasets.Dataset`] or [`~datasets.IterableDataset`]):
+            Dataset to use for training. SFT supports both [language modeling](#language-modeling) type and
+            [prompt-completion](#prompt-completion) type. The format of the samples can be either:
+
+            - [Standard](dataset_formats#standard): Each sample contains plain text.
+            - [Conversational](dataset_formats#conversational): Each sample contains structured messages (e.g., role
+              and content).
+
+            The trainer also supports processed datasets (tokenized) as long as they contain an `input_ids` field.
+        eval_dataset ([`~datasets.Dataset`], [`~datasets.IterableDataset`] or `dict[str, Dataset | IterableDataset]`):
+            Dataset to use for evaluation. It must meet the same requirements as `train_dataset`.
+        processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.ProcessorMixin`], *optional*):
+            Processing class used to process the data. If `None`, the processing class is loaded from the model's name
+            with [`~transformers.AutoProcessor.from_pretrained`]. A padding token, `tokenizer.pad_token`, must be set.
+            If the processing class has not set a padding token, `tokenizer.eos_token` will be used as the default.
+        compute_loss_func (`Callable`, *optional*):
+            A function that accepts the raw model outputs, labels, and the number of items in the entire accumulated
+            batch (batch_size * gradient_accumulation_steps) and returns the loss. For example, see the default [loss
+            function](https://github.com/huggingface/transformers/blob/052e652d6d53c2b26ffde87e039b723949a53493/src/transformers/trainer.py#L3618)
+            used by [`Trainer`].
+        compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
+            The function that will be used to compute metrics at evaluation. Must take a
+            [`~transformers.EvalPrediction`] and return a dictionary string to metric values. When passing
+            [`SFTConfig`] with `batch_eval_metrics` set to `True`, your `compute_metrics` function must take a boolean
+            `compute_result` argument. This will be triggered after the last eval batch to signal that the function
+            needs to calculate and return the global summary statistics rather than accumulating the batch-level
+            statistics.
+        callbacks (list of [`~transformers.TrainerCallback`], *optional*):
+            List of callbacks to customize the training loop. Will add those to the list of default callbacks detailed
+            in [here](https://huggingface.co/docs/transformers/main_classes/callback).
+
+            If you want to remove one of the default callbacks used, use the [`~transformers.Trainer.remove_callback`]
+            method.
+        optimizers (`tuple[torch.optim.Optimizer | None, torch.optim.lr_scheduler.LambdaLR | None]`, *optional*, defaults to `(None, None)`):
+            A tuple containing the optimizer and the scheduler to use. Will default to an instance of `AdamW` on your
+            model and a scheduler given by [`~transformers.get_linear_schedule_with_warmup`] controlled by `args`.
+        optimizer_cls_and_kwargs (`tuple[Type[torch.optim.Optimizer], Dict[str, Any]]`, *optional*):
+            A tuple containing the optimizer class and keyword arguments to use. Overrides `optim` and `optim_args` in
+            `args`. Incompatible with the `optimizers` argument.
+
+            Unlike `optimizers`, this argument avoids the need to place model parameters on the correct devices before
+            initializing the Trainer.
+        preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`, *optional*):
+            A function that preprocess the logits right before caching them at each evaluation step. Must take two
+            tensors, the logits and the labels, and return the logits once processed as desired. The modifications made
+            by this function will be reflected in the predictions received by `compute_metrics`.
+
+            Note that the labels (second parameter) will be `None` if the dataset does not have them.
+        peft_config ([`~peft.PeftConfig`], *optional*):
+            PEFT configuration used to wrap the model. If `None`, the model is not wrapped.
+        formatting_func (`Callable`, *optional*):
+            Formatting function applied to the dataset before tokenization. Applying the formatting function explicitly
+            converts the dataset into a [language modeling](#language-modeling) type.
+    """
+
+    _tag_names = ["trl", "sft"]
+    _name = "SFT"
+
+    def __init__(
+        self,
+        model: str | PreTrainedModel,
+        args: SFTConfig | TrainingArguments | None = None,
+        data_collator: DataCollator | None = None,
+        train_dataset: Dataset | IterableDataset | None = None,
+        eval_dataset: Dataset | dict[str, Dataset] | None = None,
+        processing_class: PreTrainedTokenizerBase | ProcessorMixin | None = None,
+        compute_loss_func: Callable | None = None,
+        compute_metrics: Callable[[EvalPrediction], dict] | None = None,
+        callbacks: list[TrainerCallback] | None = None,
+        optimizers: tuple[
+            torch.optim.Optimizer | None, torch.optim.lr_scheduler.LambdaLR | None
+        ] = (None, None),
+        optimizer_cls_and_kwargs: tuple[type[torch.optim.Optimizer], dict[str, Any]]
+        | None = None,
+        preprocess_logits_for_metrics: Callable[
+            [torch.Tensor, torch.Tensor], torch.Tensor
+        ]
+        | None = None,
+        peft_config: "PeftConfig | None" = None,
+        formatting_func: Callable[[dict], str] | None = None,
+    ):
+        # Args
+        if args is None:
+            model_name = (
+                model if isinstance(model, str) else get_config_model_id(model.config)
+            )
+            model_name = model_name.split("/")[-1]
+            args = SFTConfig(f"{model_name}-SFT")
+        elif isinstance(args, TrainingArguments) and not isinstance(args, SFTConfig):
+            dict_args = args.to_dict()
+            dict_args["hub_token"] = args.hub_token  # to_dict hides the hub_token
+            dict_args.pop("push_to_hub_token")
+            args = SFTConfig(**dict_args)
+
+        # Model
+        if isinstance(model, str):
+            model = create_model_from_path(model, **args.model_init_kwargs or {})
+        elif args.model_init_kwargs is not None:
+            logger.warning(
+                "You passed `model_init_kwargs` to the `SFTConfig`, but your model is already instantiated. "
+                "The `model_init_kwargs` will be ignored."
+            )
+
+        # Processing class
+        if processing_class is None:
+            processing_class = AutoProcessor.from_pretrained(
+                get_config_model_id(model.config)
+            )
+
+        # Handle pad token for processors or tokenizers
+        if isinstance(processing_class, ProcessorMixin):
+            tokenizer = processing_class.tokenizer
+            self._is_vlm = True
+        elif isinstance(processing_class, PreTrainedTokenizerBase):
+            tokenizer = processing_class
+            self._is_vlm = False
+        else:
+            raise TypeError(
+                "The `processing_class` must be either a `PreTrainedTokenizerBase` or a `ProcessorMixin`"
+            )
+
+        if args.eos_token is not None:
+            eos_token = args.eos_token
+            eos_token_id = tokenizer.convert_tokens_to_ids(eos_token)
+            if eos_token_id is None:
+                raise ValueError(
+                    f"The specified `eos_token` ('{eos_token}') is not found in the vocabulary of the given "
+                    f"`processing_class` ({processing_class.__class__.__name__}). Ensure that the `eos_token` exists "
+                    "in the vocabulary before using it as an EOS token."
+                )
+            tokenizer.eos_token_id = eos_token_id
+
+        if args.chat_template_path is not None:
+            if os.path.isfile(
+                args.chat_template_path
+            ) and args.chat_template_path.endswith((".jinja", ".j2")):
+                with open(
+                    args.chat_template_path, encoding="utf-8"
+                ) as chat_template_file:
+                    processing_class.chat_template = chat_template_file.read()
+                added_tokens = []
+            else:
+                model, processing_class, added_tokens = clone_chat_template(
+                    model, processing_class, args.chat_template_path
+                )
+        else:
+            added_tokens = []
+
+        # Catch some wrong configurations related to VLMs
+        if self._is_vlm and args.packing:
+            raise ValueError(
+                "Packing is not supported for vision-language models. Please set `packing=False` in the SFTConfig."
+            )
+        if self._is_vlm and args.padding_free:
+            raise ValueError(
+                "Padding-free training is yet not supported for vision-language models. Please set "
+                "`padding_free=False` in the `SFTConfig`."
+            )
+        if self._is_vlm and args.assistant_only_loss:
+            raise ValueError(
+                "Assistant-only loss is not yet supported for vision-language models. Please set "
+                "`assistant_only_loss=False` in the `SFTConfig`."
+            )
+
+        # PEFT configuration and model wrapping
+        if peft_config is not None:
+            if added_tokens:
+                # Ensure that the added tokens are trainable
+                if peft_config.trainable_token_indices is None:
+                    peft_config.trainable_token_indices = {"embed_tokens": added_tokens}
+                elif "embed_tokens" not in peft_config.trainable_token_indices:
+                    peft_config.trainable_token_indices["embed_tokens"] = added_tokens
+                else:
+                    peft_config.trainable_token_indices["embed_tokens"].extend(
+                        added_tokens
+                    )
+
+                # Ensure that the lm_head is trainable
+                if (
+                    peft_config.modules_to_save is None
+                    or "lm_head" not in peft_config.modules_to_save
+                ):
+                    logger.warning(
+                        "Cloning chat template added new tokens to the tokenizer, but 'lm_head' is not in PEFT's "
+                        "`modules_to_save`. As a result, the model may not learn to generate outputs with these new "
+                        "tokens, leading to degraded generation quality. To fix this, add "
+                        "`modules_to_save=['lm_head']` to your PEFT configuration."
+                    )
+
+                    if peft_config.modules_to_save is None:
+                        peft_config.modules_to_save = ["lm_head"]
+                    else:
+                        peft_config.modules_to_save.append("lm_head")
+
+        # In Prompt Tuning a small set of trainable virtual tokens (continuous prompt embeddings) is prepended to the
+        # input. We store the number of these tokens so we can account for them correctly when calculating accuracy.
+        self.num_virtual_tokens = 0
+
+        if peft_config is not None or (
+            is_peft_available() and isinstance(model, PeftModel)
+        ):
+            model = prepare_peft_model(model, peft_config, args)
+            if model.active_adapter in model.peft_config:
+                peft_model_config = model.peft_config[model.active_adapter]
+                self.num_virtual_tokens = getattr(
+                    peft_model_config, "num_virtual_tokens", 0
+                )
+
+        # Data collator
+        # BFD packing requires padding-free mode; otherwise, the collator outputs padded attention masks, causing
+        # FlashAttention to ignore position_ids and recompute them incorrectly from the padded attention mask.
+        self.padding_free = args.padding_free or (
+            args.packing and args.packing_strategy == "bfd"
+        )
+        use_flash_attention = (
+            model.config._attn_implementation in FLASH_ATTENTION_VARIANTS
+        )
+        if self.padding_free:
+            if data_collator is not None:
+                raise ValueError(
+                    "Passing a custom data collator is not supported when using padding-free."
+                )
+            if args.packing and args.packing_strategy == "wrapped":
+                logger.warning(
+                    "You are passing `padding_free=True` with the 'wrapped' packing strategy, which is not "
+                    "recommended. Please refer to the documentation to understand why this is not recommended."
+                )
+            if not use_flash_attention:
+                logger.warning(
+                    "Padding-free training is enabled, but the attention implementation is not set to a supported "
+                    "flash attention variant. Padding-free training flattens batches into a single sequence, and only "
+                    "the following implementations are known to reliably support this: "
+                    f"{', '.join(sorted(FLASH_ATTENTION_VARIANTS))}. Using other implementations may lead to "
+                    "unexpected behavior. To ensure compatibility, set `attn_implementation` in the model "
+                    "configuration to one of these supported options or verify that your attention mechanism can "
+                    "handle flattened sequences."
+                )
+
+            if args.per_device_train_batch_size == 1 and not args.packing:
+                logger.warning(
+                    "You are using a per_device_train_batch_size of 1 with padding-free training. Using a batch size "
+                    "of 1 anihilate the benefits of padding-free training. Please consider increasing the batch size "
+                    "to at least 2."
+                )
+
+        # Decide whether to use completion-only loss: if not specified, then it is set to True if the dataset format
+        # is prompt-completion, and False if the dataset format is language modeling.
+        dataset_sample = next(iter(train_dataset))
+        if args.completion_only_loss is None:
+            self.completion_only_loss = (
+                "prompt" in dataset_sample and "completion" in dataset_sample
+            )
+        else:
+            self.completion_only_loss = args.completion_only_loss
+
+        self._is_vision_dataset = (
+            "image" in dataset_sample or "images" in dataset_sample
+        )
+        if self._is_vision_dataset and not self._is_vlm:
+            raise ValueError(
+                "The dataset appears to be vision-related (contains 'image' or 'images' keys), but the provided "
+                "model does not seem to be a vision-language model. Please check your model and dataset."
+            )
+
+        if data_collator is None and not self._is_vision_dataset:
+            # Get the pad token: if not provided, use the one from the processing class or the eos token
+            # if the processing class does not have a pad token.
+            pad_token = args.pad_token or tokenizer.pad_token or tokenizer.eos_token
+            pad_token_id = tokenizer.convert_tokens_to_ids(pad_token)
+            if pad_token_id is None:
+                raise ValueError(
+                    f"The specified `pad_token` ('{pad_token}') is not found in the vocabulary of the given "
+                    f"`processing_class` ({processing_class.__class__.__name__}). Ensure that the `pad_token` exists "
+                    "in the vocabulary before using it as a padding token."
+                )
+            data_collator = DataCollatorForLanguageModeling(
+                pad_token_id=pad_token_id,
+                completion_only_loss=self.completion_only_loss,
+                padding_free=self.padding_free,
+                pad_to_multiple_of=args.pad_to_multiple_of,
+            )
+        elif data_collator is None and self._is_vision_dataset:
+            data_collator = DataCollatorForVisionLanguageModeling(
+                processor=processing_class,
+                max_length=args.max_length,
+                completion_only_loss=self.completion_only_loss,
+                pad_to_multiple_of=args.pad_to_multiple_of,
+                dataset_text_field=args.dataset_text_field,
+            )
+
+        if args.packing and args.packing_strategy == "bfd" and not use_flash_attention:
+            logger.warning(
+                "You are using packing, but the attention implementation is not set to a supported flash attention "
+                "variant. Packing gathers multiple samples into a single sequence, and only the following "
+                f"implementations are known to reliably support this: {', '.join(sorted(FLASH_ATTENTION_VARIANTS))}. "
+                "Using other implementations may lead to cross-contamination between samples. To avoid this, either "
+                "disable packing by setting `packing=False`, or set `attn_implementation` in the model configuration "
+                "to one of these supported options."
+            )
+        if args.assistant_only_loss and not is_conversational(dataset_sample):
+            raise ValueError(
+                "You set `assistant_only_loss=True`, but the dataset is not conversational. This option is only "
+                "supported for conversational datasets."
+            )
+
+        # Dataset
+        # Skip dataset preparation if `skip_prepare_dataset=True` in `dataset_kwargs`, or if it's a VLM, where
+        # preprocessing (e.g., image-to-pixel conversion) is too costly and done on the fly instead.
+        skip_prepare_dataset = (
+            args.dataset_kwargs is not None
+            and args.dataset_kwargs.get("skip_prepare_dataset", False)
+            or self._is_vision_dataset
+        )
+        if not skip_prepare_dataset:
+            if self.completion_only_loss and formatting_func:
+                raise ValueError(
+                    "A formatting function was provided while `completion_only_loss=True`, which is incompatible. "
+                    "Using a formatter converts the dataset to a language modeling type, conflicting with "
+                    "completion-only loss. To resolve this, apply your formatting function before passing the "
+                    "dataset, or disable `completion_only_loss` in `SFTConfig`."
+                )
+            train_dataset = self._prepare_dataset(
+                train_dataset,
+                processing_class,
+                args,
+                args.packing,
+                formatting_func,
+                "train",
+            )
+            if eval_dataset is not None:
+                packing = (
+                    args.packing if args.eval_packing is None else args.eval_packing
+                )
+                if isinstance(eval_dataset, dict):
+                    eval_dataset = {
+                        key: self._prepare_dataset(
+                            dataset,
+                            processing_class,
+                            args,
+                            packing,
+                            formatting_func,
+                            key,
+                        )
+                        for key, dataset in eval_dataset.items()
+                    }
+                else:
+                    eval_dataset = self._prepare_dataset(
+                        eval_dataset,
+                        processing_class,
+                        args,
+                        packing,
+                        formatting_func,
+                        "eval",
+                    )
+
+        # Loss function
+        if args.loss_type == "nll":
+            pass  # use the default loss
+        elif args.loss_type == "dft":
+            if compute_loss_func is not None:
+                raise ValueError(
+                    "You passed a `compute_loss_func` together with `loss_type='dft'` to the `SFTTrainer`. "
+                    "When using `loss_type='dft'`, the loss function is internally set to the DFT loss, so passing a "
+                    "`compute_loss_func` is not allowed."
+                )
+            compute_loss_func = dft_loss
+        else:
+            raise ValueError(
+                f"Invalid `loss_type` {args.loss_type} passed. Supported values are 'nll' and 'dft'."
+            )
+
+        # Initialize the metrics
+        self._metrics = {"train": defaultdict(list), "eval": defaultdict(list)}
+        self._total_train_tokens = 0
+
+        # Initialize the Trainer. Parent class will handle:
+        # - DeepSpeed configuration (through create_accelerator_and_postprocess)
+        # - FSDP setup
+        # - Distributed training setup
+        # - Optimizer and scheduler creation
+
+        super().__init__(
+            model=model,
+            args=args,
+            data_collator=data_collator,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            processing_class=processing_class,
+            compute_loss_func=compute_loss_func,
+            compute_metrics=compute_metrics,
+            callbacks=callbacks,
+            optimizers=optimizers,
+            optimizer_cls_and_kwargs=optimizer_cls_and_kwargs,
+            preprocess_logits_for_metrics=preprocess_logits_for_metrics,
+        )
+
+        # Initialize activation offloading context
+        if self.args.activation_offloading:
+            self.maybe_activation_offload_context = get_act_offloading_ctx_manager(
+                model=self.model
+            )
+        else:
+            self.maybe_activation_offload_context = contextlib.nullcontext()
+
+        # Add tags for models that have been loaded with the correct transformers version
+        if hasattr(self.model, "add_model_tags"):
+            self.model.add_model_tags(self._tag_names)
+
+        self.aux_loss_enabled = getattr(model.config, "output_router_logits", False)
+
+    def _prepare_dataset(
+        self,
+        dataset: Dataset | IterableDataset,
+        processing_class: PreTrainedTokenizerBase
+        | BaseImageProcessor
+        | FeatureExtractionMixin
+        | ProcessorMixin,
+        args: SFTConfig,
+        packing: bool,
+        formatting_func: Callable[[dict], str] | None,
+        dataset_name: str,
+    ) -> Dataset | IterableDataset:
+        # Tabular backends like Arrow/Parquet insert `None` for mismatched keys in nested structures. Clean them from
+        # sampled data.
+        if isinstance(
+            dataset, Dataset
+        ):  # IterableDataset does not support `with_transform`
+            dataset = dataset.with_transform(remove_none_values)
+
+        # If the dataset is already preprocessed (tokenized), skip the processing steps.
+        column_names = get_dataset_column_names(dataset)
+        is_processed = "input_ids" in column_names
+
+        # Build the kwargs for the `map` function
+        map_kwargs = {}
+        if isinstance(dataset, Dataset):  # IterableDataset does not support num_proc
+            map_kwargs["num_proc"] = args.dataset_num_proc
+
+        with PartialState().main_process_first():
+            # Apply the formatting function if any
+            if formatting_func is not None and is_processed:
+                logger.warning(
+                    "You passed a dataset that is already processed (contains an `input_ids` field) together with a "
+                    "formatting function. Therefore `formatting_func` will be ignored. Either remove the "
+                    "`formatting_func` or pass a dataset that is not already processed.",
+                )
+
+            if formatting_func is not None and not is_processed:
+                if isinstance(
+                    dataset, Dataset
+                ):  # `IterableDataset.map` does not support `desc`
+                    map_kwargs["desc"] = (
+                        f"Applying formatting function to {dataset_name} dataset"
+                    )
+
+                def _func(example):
+                    return {"text": formatting_func(example)}
+
+                dataset = dataset.map(_func, batched=False, **map_kwargs)
+
+            if not is_processed:
+                # Convert the dataset to ChatML if needed
+                first_example = next(iter(dataset))
+                if is_conversational_from_value(first_example):
+                    if isinstance(
+                        dataset, Dataset
+                    ):  # `IterableDataset.map` does not support `desc`
+                        map_kwargs["desc"] = (
+                            f"Converting {dataset_name} dataset to ChatML"
+                        )
+                    column_names = get_dataset_column_names(dataset)
+                    dataset = dataset.map(
+                        maybe_convert_to_chatml,
+                        remove_columns="conversations"
+                        if "conversations" in column_names
+                        else None,
+                        **map_kwargs,
+                    )
+
+                # Apply the chat template if needed
+                first_example = next(iter(dataset))
+                if not is_conversational(first_example):
+                    if isinstance(
+                        dataset, Dataset
+                    ):  # `IterableDataset.map` does not support `desc`
+                        map_kwargs["desc"] = f"Adding EOS to {dataset_name} dataset"
+
+                    def add_eos(example, eos_token):
+                        if "text" in example and not example["text"].endswith(
+                            eos_token
+                        ):  # language modeling case
+                            example["text"] = example["text"] + eos_token
+                        elif "completion" in example and not example[
+                            "completion"
+                        ].endswith(eos_token):
+                            example["completion"] = example["completion"] + eos_token
+                        return example
+
+                    dataset = dataset.map(
+                        add_eos,
+                        fn_kwargs={"eos_token": processing_class.eos_token},
+                        remove_columns="messages"
+                        if "messages" in column_names
+                        else None,  # renamed to "text"
+                        **map_kwargs,
+                    )
+
+                # Tokenize the dataset
+                if isinstance(
+                    dataset, Dataset
+                ):  # `IterableDataset.map` does not support `desc`
+                    map_kwargs["desc"] = f"Tokenizing {dataset_name} dataset"
+
+                def tokenize_fn(
+                    example, processing_class, dataset_text_field, assistant_only_loss
+                ):
+                    if "prompt" in example:  # prompt-completion case
+                        output = {}
+                        if is_conversational(example):
+                            if self._is_vlm:
+                                prompt = prepare_multimodal_messages(
+                                    example["prompt"], images=[]
+                                )
+                                completion = prepare_multimodal_messages(
+                                    example["completion"], images=[]
+                                )
+                            else:
+                                prompt = example["prompt"]
+                                completion = example["completion"]
+                            prompt_ids = processing_class.apply_chat_template(
+                                prompt,
+                                tokenize=True,
+                                add_generation_prompt=True,
+                                tools=example.get("tools"),
+                                **example.get("chat_template_kwargs", {}),
+                            )
+                            # Fix transformers inconsistency: for VLMs, apply_chat_template returns lists of lists
+                            # even for single examples, while for LLMs it returns lists of ints.
+                            prompt_ids = (
+                                prompt_ids[0]
+                                if isinstance(prompt_ids[0], list)
+                                else prompt_ids
+                            )
+                            prompt_completion_processed = (
+                                processing_class.apply_chat_template(
+                                    prompt + completion,
+                                    return_dict=True,
+                                    tokenize=True,
+                                    return_assistant_tokens_mask=assistant_only_loss,
+                                    tools=example.get("tools"),
+                                    **example.get("chat_template_kwargs", {}),
+                                )
+                            )
+                            # Fix transformers inconsistency: for VLMs, apply_chat_template returns lists of lists
+                            # even for single examples, while for LLMs it returns lists of ints.
+                            prompt_completion_processed = {
+                                k: v[0] if isinstance(v[0], list) else v
+                                for k, v in prompt_completion_processed.items()
+                            }
+                            prompt_completion_ids = prompt_completion_processed[
+                                "input_ids"
+                            ]
+                            if "assistant_masks" in prompt_completion_processed:
+                                output["assistant_masks"] = prompt_completion_processed[
+                                    "assistant_masks"
+                                ]
+                        else:
+                            prompt_ids = processing_class(text=example["prompt"])[
+                                "input_ids"
+                            ]
+                            prompt_completion_ids = processing_class(
+                                text=example["prompt"] + example["completion"]
+                            )["input_ids"]
+
+                        # Check if the tokenized prompt starts with the tokenized prompt+completion
+                        if not prompt_completion_ids[: len(prompt_ids)] == prompt_ids:
+                            logger.warning(
+                                "Mismatch between tokenized prompt and the start of tokenized prompt+completion. "
+                                "This may be due to unexpected tokenizer behavior, whitespace issues, or special "
+                                "token handling. Verify that the tokenizer is processing text consistently."
+                            )
+
+                        # Create completion mask
+                        completion_mask = [0] * len(prompt_ids) + [1] * (
+                            len(prompt_completion_ids) - len(prompt_ids)
+                        )
+                        output["input_ids"] = prompt_completion_ids
+                        output["completion_mask"] = completion_mask
+
+                    elif is_conversational(example):
+                        if self._is_vlm:
+                            messages = prepare_multimodal_messages(
+                                example["messages"], images=[]
+                            )
+                        else:
+                            messages = example["messages"]
+                        processed = processing_class.apply_chat_template(
+                            messages,
+                            return_dict=True,
+                            tokenize=True,
+                            return_assistant_tokens_mask=assistant_only_loss,
+                            tools=example.get("tools"),
+                            **example.get("chat_template_kwargs", {}),
+                        )
+                        # Fix transformers inconsistency: for VLMs, apply_chat_template returns lists of lists
+                        # even for single examples, while for LLMs it returns lists of ints.
+                        processed = {
+                            k: v[0] if isinstance(v[0], list) else v
+                            for k, v in processed.items()
+                        }
+                        output = {
+                            k: processed[k]
+                            for k in ("input_ids", "assistant_masks")
+                            if k in processed
+                        }
+                    else:
+                        output = {
+                            "input_ids": processing_class(
+                                text=example[dataset_text_field]
+                            )["input_ids"]
+                        }
+
+                    if (
+                        "assistant_masks" in output
+                        and 1 not in output["assistant_masks"]
+                    ):
+                        raise RuntimeError(
+                            "You're using `assistant_only_loss=True`, but at least one example has no assistant "
+                            "tokens. This usually means the tokenizer's chat template doesn't generate assistant "
+                            "masks — it may be missing the `{% generation %}` keyword. Please check the template and "
+                            "ensure it's correctly configured to support assistant masking."
+                        )
+                    return output
+
+                dataset = dataset.map(
+                    tokenize_fn,
+                    fn_kwargs={
+                        "processing_class": processing_class,
+                        "dataset_text_field": args.dataset_text_field,
+                        "assistant_only_loss": args.assistant_only_loss,
+                    },
+                    **map_kwargs,
+                )
+
+            # Pack or truncate
+            if packing:
+                if args.max_length is None:
+                    raise ValueError(
+                        "When packing is enabled, `max_length` can't be `None`."
+                    )
+                if isinstance(
+                    dataset, Dataset
+                ):  # `IterableDataset.map` does not support `desc`
+                    map_kwargs["desc"] = f"Packing {dataset_name} dataset"
+
+                columns = ["input_ids"]
+                if "completion_mask" in get_dataset_column_names(dataset):
+                    columns.append("completion_mask")
+                if "assistant_masks" in get_dataset_column_names(dataset):
+                    columns.append("assistant_masks")
+
+                dataset = dataset.select_columns(columns)
+
+                # Packing adds new column "seq_lengths" needed for document aware FlashAttention
+                dataset = pack_dataset(
+                    dataset, args.max_length, args.packing_strategy, map_kwargs
+                )
+            elif args.max_length is not None:
+                if isinstance(
+                    dataset, Dataset
+                ):  # `IterableDataset.map` does not support `desc`
+                    map_kwargs["desc"] = f"Truncating {dataset_name} dataset"
+                dataset = truncate_dataset(dataset, args.max_length, map_kwargs)
+            # For Liger kernel, ensure only the essential columns
+            if args.use_liger_kernel:
+                collator_expected_keys = {
+                    "input_ids",
+                    "seq_lengths",
+                    "completion_mask",
+                    "assistant_masks",
+                }
+                column_names = get_dataset_column_names(dataset)
+                dataset = dataset.select_columns(
+                    collator_expected_keys.intersection(column_names)
+                )
+
+        return dataset
+
+    def _set_signature_columns_if_needed(self):
+        # If `self.args.remove_unused_columns` is True, non-signature columns are removed.
+        # By default, this method sets `self._signature_columns` to the model's expected inputs (usually, "input_ids"
+        # and "attention_mask"). When using `train_on_completion_only` we add a "completion_mask" column to the
+        # dataset. So we need to override the default signature columns to include "completion_mask" as well.
+        if self._signature_columns is None:
+            if self._is_vision_dataset:
+                self._signature_columns = ["messages", "prompt", "completion", "images"]
+            else:
+                self._signature_columns = [
+                    "input_ids",
+                    "labels",
+                    "seq_lengths",
+                    "completion_mask",
+                    "assistant_masks",
+                ]
+
+    def compute_loss(
+        self,
+        model: nn.Module,
+        inputs: dict[str, torch.Tensor | Any],
+        return_outputs: bool = False,
+        num_items_in_batch: torch.Tensor | None = None,
+    ):
+        """
+        Compute training loss and additionally compute token accuracies
+        """
+        mode = "train" if self.model.training else "eval"
+
+        # Set aside labels as it will be dropped by super().compute_loss() if a custom `compute_loss_func` is used.
+        # This can be removed when this issue is fixed.
+        labels = inputs["labels"]
+
+        # If not set, defaults from model config and may warn since cache isn't compatible with gradient checkpointing
+        inputs["use_cache"] = False
+        (loss, outputs) = super().compute_loss(
+            model, inputs, return_outputs=True, num_items_in_batch=num_items_in_batch
+        )
+
+        # Compute entropy
+        if not self.args.use_liger_kernel:  # liger doesn't return logits
+            with torch.no_grad():
+                per_token_entropy = entropy_from_logits(outputs.logits)
+                # When using Prompt Tuning, skip the virtual tokens in logits before entropy computation, since they
+                # do not correspond to actual input tokens.
+                if (
+                    self.num_virtual_tokens > 0
+                    and model.peft_config[model.active_adapter].peft_type
+                    != PeftType.PREFIX_TUNING
+                ):
+                    per_token_entropy = per_token_entropy[:, self.num_virtual_tokens :]
+                if "attention_mask" in inputs:
+                    attention_mask = inputs["attention_mask"]
+                    entropy = (
+                        torch.sum(per_token_entropy * attention_mask)
+                        / attention_mask.sum()
+                    )
+                elif "position_ids" in inputs:
+                    entropy = torch.mean(per_token_entropy)
+                else:
+                    raise ValueError(
+                        "Expected 'attention_mask' or 'position_ids' in inputs."
+                    )
+                entropy = self.accelerator.gather_for_metrics(entropy).mean().item()
+            self._metrics[mode]["entropy"].append(entropy)
+
+        if mode == "train":
+            # When using padding-free, the attention_mask is not present in the inputs, instead we have cu_seq_lens_q,
+            # cu_seq_lens_k, and max_length_k, max_length_q and position_ids.
+            if "attention_mask" in inputs:
+                num_tokens_in_batch = (
+                    self.accelerator.gather_for_metrics(inputs["attention_mask"].sum())
+                    .sum()
+                    .item()
+                )
+            elif "position_ids" in inputs:
+                local_num_tokens = torch.tensor(
+                    inputs["position_ids"].size(1), device=inputs["position_ids"].device
+                )
+                num_tokens_in_batch = (
+                    self.accelerator.gather_for_metrics(local_num_tokens).sum().item()
+                )
+            else:
+                raise ValueError(
+                    "Expected 'attention_mask' or 'position_ids' in inputs."
+                )
+            self._total_train_tokens += num_tokens_in_batch
+        self._metrics[mode]["num_tokens"] = [self._total_train_tokens]
+
+        # Compute token accuracy if we have labels and if the model is not using Liger (no logits)
+        if not self.args.use_liger_kernel:
+            with torch.no_grad():
+                if "shift_labels" in inputs:
+                    # When using CP, labels are pre-shifted. We must use these (and cannot manually shift) because:
+                    # - The first discarded token from inputs["labels"] actually belongs to process n-1
+                    # - The last logits require the label from process n+1
+                    shift_logits = outputs.logits.contiguous()
+                    shift_labels = inputs["shift_labels"]
+                else:
+                    shift_logits = outputs.logits[..., :-1, :].contiguous()
+                    shift_labels = labels[..., 1:].contiguous()
+
+                # Prompt Tuning and P-Tuning output logits for virtual tokens but Prefix-Tuning does not.
+                if (
+                    self.num_virtual_tokens > 0
+                    and model.peft_config[model.active_adapter].peft_type
+                    != PeftType.PREFIX_TUNING
+                ):
+                    shift_logits = shift_logits[:, self.num_virtual_tokens :, :]
+
+                # Get predictions
+                predictions = shift_logits.argmax(dim=-1)
+
+                # Create mask for non-padding tokens (assuming ignore_index is -100)
+                mask = shift_labels != -100
+
+                # Calculate accuracy only on non-padding tokens
+                correct_predictions = (predictions == shift_labels) & mask
+                total_tokens = mask.sum()
+                correct_tokens = correct_predictions.sum()
+
+                # Gather the correct_tokens and total_tokens across all processes
+                correct_tokens = self.accelerator.gather_for_metrics(correct_tokens)
+                total_tokens = self.accelerator.gather_for_metrics(total_tokens)
+
+                # Compute the mean token accuracy and log it
+                total_sum = total_tokens.sum()
+                accuracy = (
+                    (correct_tokens.sum() / total_sum).item() if total_sum > 0 else 0.0
+                )
+                self._metrics[mode]["mean_token_accuracy"].append(accuracy)
+                if self.aux_loss_enabled:
+                    aux_loss = outputs.aux_loss
+                    aux_loss = (
+                        self.accelerator.gather_for_metrics(aux_loss).mean().item()
+                    )
+                    self._metrics[mode]["aux_loss"].append(aux_loss)
+
+        return (loss, outputs) if return_outputs else loss
+
+    # Override training step to add activation offloading context.
+    def training_step(self, *args, **kwargs):
+        with self.maybe_activation_offload_context:
+            return super().training_step(*args, **kwargs)
+
+    def log(self, logs: dict[str, float], start_time: float | None = None) -> None:
+        mode = "train" if self.model.training else "eval"
+        metrics = {
+            key: sum(val) / len(val) for key, val in self._metrics[mode].items()
+        }  # average the metrics
+
+        # This method can be called both in training and evaluation. When called in evaluation, the keys in `logs`
+        # start with "eval_". We need to add the prefix "eval_" to the keys in `metrics` to match the format.
+        if mode == "eval":
+            metrics = {f"eval_{key}": val for key, val in metrics.items()}
+
+        logs.update(metrics)
+        super().log(logs, start_time)
+        self._metrics[mode].clear()
+
+    # Ensure the model card is saved along with the checkpoint
+    def _save_checkpoint(self, model, trial):
+        if self.args.hub_model_id is None:
+            model_name = Path(self.args.output_dir).name
+        else:
+            model_name = self.args.hub_model_id.split("/")[-1]
+        self.create_model_card(model_name=model_name)
+        super()._save_checkpoint(model, trial)
diff --git a/src/aixpert/training/training/trl/trainer/utils.py b/src/aixpert/training/training/trl/trainer/utils.py
new file mode 100644
index 0000000..3ab392d
--- /dev/null
+++ b/src/aixpert/training/training/trl/trainer/utils.py
@@ -0,0 +1,2207 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import dataclasses
+import importlib.resources as pkg_resources
+import json
+import os
+import random
+import socket
+import warnings
+from collections.abc import Mapping, Sequence, Sized
+from dataclasses import dataclass, field
+from importlib.metadata import version
+from itertools import accumulate
+from typing import Any, Literal, TypeVar
+
+import numpy as np
+import pandas as pd
+import torch
+import torch.nn.functional as F
+import torch.utils.data
+import transformers
+from accelerate import Accelerator, PartialState, logging
+from accelerate.state import AcceleratorState
+from huggingface_hub import ModelCard, ModelCardData
+from torch.nn.utils.rnn import pad_sequence
+from torch.utils.data import Sampler
+from transformers import (
+    AutoConfig,
+    BitsAndBytesConfig,
+    EvalPrediction,
+    GenerationConfig,
+    PretrainedConfig,
+    PreTrainedModel,
+    PreTrainedTokenizerBase,
+    TrainerState,
+    TrainingArguments,
+    is_comet_available,
+)
+from transformers.utils import (
+    ModelOutput,
+    is_peft_available,
+    is_rich_available,
+    is_torch_mlu_available,
+    is_torch_npu_available,
+    is_torch_xpu_available,
+)
+
+from ..trainer.model_config import ModelConfig
+
+
+if is_rich_available():
+    from rich.console import Console
+    from rich.panel import Panel
+    from rich.table import Table
+    from rich.text import Text
+
+if is_comet_available():
+    import comet_ml
+
+if is_peft_available():
+    from peft import LoraConfig, PeftConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+@dataclass
+class DataCollatorForChatML:
+    """
+    Data collator for ChatML format datasets.
+    """
+
+    tokenizer: PreTrainedTokenizerBase
+    ignore_index: int = -100
+    max_length: int = None
+    prompt_key: str = "prompt"
+    messages_key: str = "messages"
+
+    def __post_init__(self):
+        if self.tokenizer.pad_token_id is None:
+            raise ValueError(
+                "The tokenizer does not have a pad token. Please set `pad_token_id` in the tokenizer."
+            )
+        if self.max_length is None:
+            # set a sensible default
+            self.max_length = min(self.tokenizer.model_max_length, 1024)
+
+    def __call__(self, examples: list[dict[str, Any]]) -> dict[str, torch.Tensor]:
+        input_ids = []
+        attention_mask = []
+        prompts_input_ids = []
+        prompt_attention_mask = []
+        labels = []
+
+        for example in examples:
+            formatted_prompt = example.get(self.prompt_key, None)
+            if formatted_prompt is None:
+                prompt = example[self.messages_key][:-1]
+                formatted_prompt = self.tokenizer.apply_chat_template(
+                    prompt, tokenize=False, add_generation_prompt=True
+                )
+
+            if "input_ids" not in example:
+                message = example[self.messages_key]
+                formatted_message = self.tokenizer.apply_chat_template(
+                    message, tokenize=False, add_generation_prompt=False
+                )
+
+                tokenized_message = self.tokenizer(
+                    formatted_message,
+                    truncation=False,
+                    padding=False,
+                    return_tensors=None,
+                    add_special_tokens=False,
+                    return_offsets_mapping=True,
+                )
+                message_input_ids_full = tokenized_message["input_ids"]
+                offsets = tokenized_message.get("offset_mapping")
+
+                if offsets is not None:
+                    prompt_char_len = len(formatted_prompt)
+                    completion_start_idx_full = next(
+                        (
+                            idx
+                            for idx, (start, _) in enumerate(offsets)
+                            if start >= prompt_char_len
+                        ),
+                        len(message_input_ids_full),
+                    )
+                else:
+                    tokenized_prompt_full = self.tokenizer(
+                        formatted_prompt,
+                        truncation=False,
+                        padding=False,
+                        return_tensors=None,
+                        add_special_tokens=False,
+                    )
+                    completion_start_idx_full = len(tokenized_prompt_full["input_ids"])
+
+                prompt_tokens_full = message_input_ids_full[:completion_start_idx_full]
+                completion_input_ids_full = message_input_ids_full[
+                    completion_start_idx_full:
+                ]
+
+                if (
+                    self.max_length is not None
+                    and len(message_input_ids_full) > self.max_length
+                ):
+                    completion_ids = completion_input_ids_full
+                    if len(completion_ids) >= self.max_length:
+                        completion_ids = completion_ids[-self.max_length :]
+                        prompt_ids = []
+                    else:
+                        max_prompt_tokens = self.max_length - len(completion_ids)
+                        prompt_ids = (
+                            prompt_tokens_full[-max_prompt_tokens:]
+                            if max_prompt_tokens > 0
+                            else []
+                        )
+                    message_input_ids = prompt_ids + completion_ids
+                else:
+                    message_input_ids = message_input_ids_full
+                    prompt_ids = prompt_tokens_full
+
+                input_ids.append(message_input_ids)
+                attention_mask.append([1] * len(message_input_ids))
+                current_prompt_ids = prompt_ids
+            else:
+                message_input_ids = example["input_ids"]
+                input_ids.append(message_input_ids)
+                if "attention_mask" in example:
+                    attention_mask.append(example["attention_mask"])
+                else:
+                    attention_mask.append([1] * len(message_input_ids))
+
+                tokenized_prompt = self.tokenizer(
+                    formatted_prompt,
+                    truncation=True,
+                    max_length=len(message_input_ids),
+                    padding=False,
+                    return_tensors=None,
+                    add_special_tokens=False,
+                )
+                current_prompt_ids = tokenized_prompt["input_ids"]
+
+            prompts_input_ids.append(current_prompt_ids)
+            prompt_attention_mask.append([1] * len(current_prompt_ids))
+
+            label = [self.ignore_index] * len(input_ids[-1])
+            completion_start_idx = len(current_prompt_ids)
+            label[completion_start_idx:] = input_ids[-1][completion_start_idx:]
+            labels.append(label)
+
+        # convert to list of tensors and pad
+        input_ids = [torch.tensor(ids, dtype=torch.long) for ids in input_ids]
+        attention_mask = [
+            torch.tensor(mask, dtype=torch.long) for mask in attention_mask
+        ]
+        labels = [torch.tensor(label, dtype=torch.long) for label in labels]
+        input_ids = pad(
+            input_ids, padding_side="left", padding_value=self.tokenizer.pad_token_id
+        )
+        attention_mask = pad(attention_mask, padding_side="left", padding_value=0)
+        labels = pad(labels, padding_side="left", padding_value=self.ignore_index)
+
+        prompts_input_ids = [
+            torch.tensor(ids, dtype=torch.long) for ids in prompts_input_ids
+        ]
+        prompt_attention_mask = [
+            torch.tensor(mask, dtype=torch.long) for mask in prompt_attention_mask
+        ]
+        prompts_input_ids = pad(
+            prompts_input_ids,
+            padding_side="left",
+            padding_value=self.tokenizer.pad_token_id,
+        )
+        prompt_attention_mask = pad(
+            prompt_attention_mask, padding_side="left", padding_value=0
+        )
+
+        return {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "labels": labels,
+            "prompts": prompts_input_ids,
+            "prompt_attention_mask": prompt_attention_mask,
+        }
+
+
+def _is_port_free(port: int, host: str = "127.0.0.1") -> bool:
+    try:
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+            s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+            s.bind((host, port))
+            return True
+    except OSError:
+        return False
+
+
+def _find_free_port() -> int:
+    candidates = (29500, 23456, 12355, 12345)
+    for p in candidates:
+        if _is_port_free(p):
+            return p
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        s.bind(("", 0))
+        return s.getsockname()[1]
+
+
+def ensure_master_addr_port(addr: str | None = None, port: int | None = None) -> None:
+    """
+    Ensure `MASTER_ADDR`/`MASTER_PORT` are set safely.
+
+    - Respects existing environment variables.
+    - Defaults `MASTER_ADDR` to localhost if unset.
+    - Chooses a free TCP port if `MASTER_PORT` is unset to avoid collisions.
+    - If `MASTER_PORT` is set to `"0"` or `"auto"`, it is resolved to a free port.
+    """
+    os.environ["MASTER_ADDR"] = os.environ.get("MASTER_ADDR") or addr or "localhost"
+
+    env_port = os.environ.get("MASTER_PORT", "").strip().lower()
+    if port is None and env_port not in {"", "0", "auto"}:
+        try:
+            port = int(env_port)
+        except ValueError:
+            pass
+
+    os.environ["MASTER_PORT"] = str(_find_free_port() if port in (None, 0) else port)
+
+
+@dataclass
+class RewardDataCollatorWithPadding:
+    # docstyle-ignore
+    r"""
+    Reward DataCollator class that pads the inputs to the maximum length of the batch.
+
+    > [!WARNING]
+    > This class is deprecated and will be removed in version 0.27.0. Please use
+    `trl.trainer.reward_trainer.DataCollatorForPreference` instead.
+
+    Args:
+        tokenizer ([`~transformers.PreTrainedTokenizerBase`]):
+            The tokenizer used for encoding the data.
+        padding (`bool | str | PaddingStrategy`, `optional`, defaults to `True`):
+            padding_strategy to pass to the tokenizer.
+        pad_to_multiple_of (`int` or `None`, `optional`, defaults to `None`):
+            If set will pad the sequence to a multiple of the provided value.
+        return_tensors (`str`, `optional`, defaults to `"pt"`):
+            The tensor type to use.
+    """
+
+    tokenizer: PreTrainedTokenizerBase
+    padding: bool | str = True
+    pad_to_multiple_of: int | None = None
+    return_tensors: str = "pt"
+
+    def __init__(self, *args, **kwargs) -> None:
+        warnings.warn(
+            "The `RewardDataCollatorWithPadding` is deprecated and will be removed in version 0.27.0. Please use "
+            "`trl.trainer.reward_trainer.DataCollatorForPreference` instead.",
+            FutureWarning,
+        )
+        super().__init__(*args, **kwargs)
+
+    def __call__(self, features: list[dict[str, Any]]) -> dict[str, Any]:
+        features_chosen = []
+        features_rejected = []
+        margin = []
+        # check if we have a margin. If we do, we need to batch it as well
+        has_margin = "margin" in features[0]
+        for feature in features:
+            # check if the keys are named as expected
+            if (
+                "input_ids_chosen" not in feature
+                or "input_ids_rejected" not in feature
+                or "attention_mask_chosen" not in feature
+                or "attention_mask_rejected" not in feature
+            ):
+                raise ValueError(
+                    "The features should include `input_ids_chosen`, `attention_mask_chosen`, `input_ids_rejected` and `attention_mask_rejected`"
+                )
+
+            features_chosen.append(
+                {
+                    "input_ids": feature["input_ids_chosen"],
+                    "attention_mask": feature["attention_mask_chosen"],
+                }
+            )
+            features_rejected.append(
+                {
+                    "input_ids": feature["input_ids_rejected"],
+                    "attention_mask": feature["attention_mask_rejected"],
+                }
+            )
+            if has_margin:
+                margin.append(feature["margin"])
+        batch_chosen = self.tokenizer.pad(
+            features_chosen,
+            padding=self.padding,
+            pad_to_multiple_of=self.pad_to_multiple_of,
+            return_tensors=self.return_tensors,
+        )
+        batch_rejected = self.tokenizer.pad(
+            features_rejected,
+            padding=self.padding,
+            pad_to_multiple_of=self.pad_to_multiple_of,
+            return_tensors=self.return_tensors,
+        )
+        batch = {
+            "input_ids_chosen": batch_chosen["input_ids"],
+            "attention_mask_chosen": batch_chosen["attention_mask"],
+            "input_ids_rejected": batch_rejected["input_ids"],
+            "attention_mask_rejected": batch_rejected["attention_mask"],
+            "return_loss": True,
+        }
+        if has_margin:
+            margin = torch.tensor(margin, dtype=torch.float)
+            batch["margin"] = margin
+        return batch
+
+
+def pad(
+    tensors: list[torch.Tensor],
+    padding_value: int = 0,
+    padding_side: str = "right",
+    pad_to_multiple_of: int | None = None,
+) -> torch.Tensor:
+    """
+    Pads a list of tensors to the same shape along the first dimension.
+
+    Args:
+        tensors (`list[torch.Tensor]`):
+            List of input tensors to pad.
+        padding_value (`int`):
+            Value to use for padding. Default is 0.
+        padding_side (`str`):
+            Side on which to add padding. Must be 'left' or 'right'. Default is 'right'.
+        pad_to_multiple_of (`int`, *optional*):
+            If set will pad the sequence to a multiple of the provided value.
+
+    Returns
+    -------
+        `torch.Tensor`:
+            A single tensor containing the padded tensors.
+
+    Examples
+    --------
+    ```python
+    >>> import torch
+
+    >>> pad([torch.tensor([1, 2, 3]), torch.tensor([4, 5])])
+    tensor([[1, 2, 3],
+            [4, 5, 0]])
+
+    >>> pad([torch.tensor([[1, 2], [3, 4]]), torch.tensor([[5, 6]])])
+    tensor([[[1, 2],
+            [3, 4]],
+            [[5, 6],
+            [0, 0]]])
+    ```
+    """
+    # Determine the maximum shape for each dimension
+    output_shape = np.max([t.shape for t in tensors], 0).tolist()
+
+    # Apply pad_to_multiple_of to the first (sequence) dimension
+    if pad_to_multiple_of is not None:
+        remainder = output_shape[0] % pad_to_multiple_of
+        if remainder != 0:
+            output_shape[0] += pad_to_multiple_of - remainder
+
+    # Create an output tensor filled with the padding value
+    output = torch.full(
+        (len(tensors), *output_shape),
+        padding_value,
+        dtype=tensors[0].dtype,
+        device=tensors[0].device,
+    )
+
+    for i, t in enumerate(tensors):
+        if padding_side == "left":
+            seq_start = output_shape[0] - t.shape[0]
+        elif padding_side == "right":
+            seq_start = 0
+        else:
+            raise ValueError("padding_side must be 'left' or 'right'")
+
+        # Define the slices
+        seq_slice = slice(seq_start, seq_start + t.shape[0])
+        slices = (seq_slice,) + tuple(slice(0, s) for s in t.shape[1:])
+        output[i][slices] = t
+
+    return output
+
+
+@dataclass
+class DPODataCollatorWithPadding:
+    r"""
+    DPO DataCollator class that pads the tokenized inputs to the maximum length of the batch.
+
+    Args:
+        pad_token_id (`int` defaults to 0):
+            The tokenizer's pad_token_id.
+        label_pad_token_id (`int`, defaults to -100):
+            The label used for masking.
+        is_encoder_decoder (`bool` or `None`, `optional`, defaults to `None`):
+            Whether you model has an encoder_decoder architecture.
+    """
+
+    pad_token_id: int = 0
+    label_pad_token_id: int = -100
+    is_encoder_decoder: bool | None = False
+
+    def __call__(self, features: list[dict[str, Any]]) -> dict[str, Any]:
+        # first, pad everything to the same length
+        padded_batch = {}
+        for k in features[0].keys():
+            if k.endswith(
+                ("_input_ids", "_attention_mask", "_labels", "_pixel_values")
+            ):
+                if self.is_encoder_decoder:
+                    to_pad = [torch.LongTensor(ex[k]) for ex in features]
+
+                    if (k.startswith("prompt")) and (k.endswith("input_ids")):
+                        if self.pad_token_id is None:
+                            raise ValueError(
+                                "Padding is enabled, but the tokenizer is not configured with a padding token."
+                                " Explicitly set `tokenizer.pad_token` (e.g. `tokenizer.pad_token = tokenizer.eos_token`)"
+                                " before calling the trainer."
+                            )
+                        padding_value = self.pad_token_id
+                    elif k.endswith("_attention_mask"):
+                        padding_value = 0
+                    elif k.startswith(("chosen", "rejected", "completion")) or (
+                        "decoder" in k
+                    ):
+                        padding_value = self.label_pad_token_id
+                    else:
+                        raise ValueError(f"Unexpected key in batch '{k}'")
+                    padded_batch[k] = pad_sequence(
+                        to_pad, batch_first=True, padding_value=padding_value
+                    )
+                else:
+                    # Set padding value based on the key
+                    if k.endswith("_input_ids"):
+                        if self.pad_token_id is None:
+                            raise ValueError(
+                                "Padding is enabled, but the tokenizer is not configured with a padding token."
+                                " Explicitly set `tokenizer.pad_token` (e.g. `tokenizer.pad_token = tokenizer.eos_token`)"
+                                " before calling the trainer."
+                            )
+                        padding_value = self.pad_token_id
+                    elif k.endswith("_labels"):
+                        padding_value = self.label_pad_token_id
+                    elif k.endswith("_attention_mask"):
+                        padding_value = 0
+                    elif k.endswith("_pixel_values"):
+                        padding_value = 0  # TODO: check if this is correct
+                    else:
+                        raise ValueError(f"Unexpected key in batch '{k}'")
+
+                    # Set padding side based on the key
+                    if k in ["prompt_input_ids", "prompt_attention_mask"]:
+                        padding_side = "left"
+                    else:
+                        padding_side = "right"
+
+                    # Set the dtype
+                    if k.endswith("_pixel_values"):
+                        dtype = (
+                            torch.float32
+                        )  # will be downcasted if necessary by the Trainer
+                    else:
+                        dtype = torch.int64
+
+                    # Convert to tensor and pad
+                    to_pad = [torch.tensor(ex[k], dtype=dtype) for ex in features]
+                    padded_batch[k] = pad(
+                        to_pad, padding_value=padding_value, padding_side=padding_side
+                    )
+            elif k.endswith("_logps"):
+                # the cached reference model logprobs
+                padded_batch[k] = torch.tensor([ex[k] for ex in features])
+            else:
+                padded_batch[k] = [ex[k] for ex in features]
+
+        return padded_batch
+
+
+@dataclass
+class RunningMoments:
+    """
+    Calculates the running mean and standard deviation of a data stream. Reference:
+    https://github.com/OpenLMLab/MOSS-RLHF/blob/40b91eb2f2b71b16919addede0341d2bef70825d/utils.py#L75
+    """
+
+    accelerator: Accelerator
+    mean: float = 0
+    std: float = 1
+    var: float = 1
+    count: float = 1e-24
+
+    @torch.no_grad()
+    def update(self, xs: torch.Tensor) -> tuple[float, float]:
+        """
+        Updates running moments from batch's moments computed across ranks
+        """
+        if self.accelerator.use_distributed:
+            xs_mean, xs_var, xs_count = get_global_statistics(self.accelerator, xs)
+        else:
+            xs_count = xs.numel()
+            xs_var, xs_mean = torch.var_mean(xs, unbiased=False)
+        xs_mean, xs_var = xs_mean.float(), xs_var.float()
+
+        delta = xs_mean - self.mean
+        tot_count = self.count + xs_count
+
+        new_sum = xs_var * xs_count
+        # correct old_sum deviation accounting for the new mean
+        old_sum = self.var * self.count + delta**2 * self.count * xs_count / tot_count
+        tot_sum = old_sum + new_sum
+
+        self.mean += (delta * xs_count / tot_count).item()
+        new_var = tot_sum / tot_count
+        self.std = (new_var * tot_count / (tot_count - 1)).float().sqrt().item()
+        self.var = new_var.item()
+        self.count = tot_count
+
+        return xs_mean.item(), (
+            xs_var * xs_count / (xs_count - 1)
+        ).float().sqrt().item()
+
+    def save_to_json(self, json_path: str):
+        """Save the content of this instance in JSON format inside `json_path`."""
+        # save everything except accelerator
+        if self.accelerator.is_main_process:
+            save_dict = dataclasses.asdict(
+                self,
+                dict_factory=lambda x: {k: v for (k, v) in x if k != "accelerator"},
+            )
+            json_string = json.dumps(save_dict, indent=2, sort_keys=True) + "\n"
+            with open(json_path, "w", encoding="utf-8") as f:
+                f.write(json_string)
+
+    @classmethod
+    def load_from_json(cls, accelerator: Accelerator, json_path: str):
+        """Create an instance from the content of `json_path`."""
+        # load everything except accelerator
+        with open(json_path, encoding="utf-8") as f:
+            text = f.read()
+        return cls(accelerator=accelerator, **json.loads(text))
+
+
+@torch.no_grad()
+def get_global_statistics(
+    accelerator, xs: torch.Tensor, mask=None, device="cpu"
+) -> tuple[torch.Tensor, torch.Tensor, int]:
+    """
+    Computes element-wise mean and variance of the tensor across processes. Reference:
+    https://github.com/OpenLMLab/MOSS-RLHF/blob/40b91eb2f2b71b16919addede0341d2bef70825d/utils.py#L57C1-L73C75
+    """
+    xs = xs.to(accelerator.device)
+    sum_and_count = torch.tensor(
+        [xs.sum(), (xs.numel() if mask is None else mask.sum())], device=xs.device
+    )
+    sum_and_count = accelerator.reduce(sum_and_count)
+    global_sum, count = sum_and_count
+    global_mean = global_sum / count
+
+    sum_var = torch.sum(((xs - global_mean) ** 2).mul(1 if mask is None else mask))
+    sum_var = accelerator.reduce(sum_var)
+    global_var = sum_var / count
+
+    return global_mean.to(device), global_var.to(device), count.item()
+
+
+def compute_accuracy(eval_pred: EvalPrediction) -> dict[str, float]:
+    predictions, labels = eval_pred
+    if predictions.ndim == 3:
+        # Token classification task. Shapes are (batch_size, seq_len, num_labels) and (batch_size, seq_len)
+        # Used to compute the accuracy in the prm_trainer.
+        predictions = np.argmax(predictions, axis=2)
+
+        # Flatten the predictions and labels to remove the ignored tokens.
+        predictions = np.array(
+            [
+                p
+                for prediction, label in zip(predictions, labels, strict=True)
+                for (p, lbl) in zip(prediction, label, strict=True)
+                if lbl != -100
+            ]
+        )
+        labels = np.array([lbl for label in labels for lbl in label if lbl != -100])
+
+    else:
+        # Here, predictions is rewards_chosen and rewards_rejected. Shapes are (batch_size, 2) and (batch_size,)
+        # We want to see how much of the time rewards_chosen > rewards_rejected.
+        equal_mask = predictions[:, 0] == predictions[:, 1]
+        equal_predictions_count = int(equal_mask.sum())
+
+        if equal_predictions_count > 0:
+            # Before using the logger, the accelerate state must be initialized. It'susually the case when using this
+            # function inside a Trainer, but it may not be the case otherwise, in particular when unit testing.
+            PartialState()
+
+            logger.warning(
+                f"There are {equal_predictions_count} out of {len(predictions[:, 0])} instances where the predictions "
+                "for both options are equal. These instances are ignored in the accuracy computation.",
+            )
+
+        # Filter out equal predictions
+        predictions = predictions[~equal_mask]
+        labels = labels[~equal_mask]
+
+        # Use the remaining predictions for accuracy calculation
+        predictions = np.argmax(predictions, axis=1)
+
+    accuracy = np.array(predictions == labels, dtype=float).mean().item()
+    return {"accuracy": accuracy}
+
+
+def pad_to_length(
+    tensor: torch.Tensor, length: int, pad_value: int | float, dim: int = -1
+) -> torch.Tensor:
+    if tensor.size(dim) >= length:
+        return tensor
+    pad_size = list(tensor.shape)
+    pad_size[dim] = length - tensor.size(dim)
+    return torch.cat(
+        [
+            tensor,
+            pad_value * torch.ones(*pad_size, dtype=tensor.dtype, device=tensor.device),
+        ],
+        dim=dim,
+    )
+
+
+def disable_dropout_in_model(model: torch.nn.Module) -> None:
+    for module in model.modules():
+        if isinstance(module, torch.nn.Dropout):
+            module.p = 0
+
+
+def exact_div(a, b, custom_error_message=""):
+    q = a // b
+    if a != q * b:
+        raise ValueError(
+            f"{custom_error_message}, inexact division: {a} / {b} = {a / b}"
+        )
+    return q
+
+
+def peft_module_casting_to_bf16(model):
+    for name, module in model.named_modules():
+        if isinstance(module, torch.nn.LayerNorm) or "norm" in name:
+            module = module.to(torch.float32)
+        elif any(x in name for x in ["lm_head", "embed_tokens", "wte", "wpe"]):
+            if hasattr(module, "weight"):
+                if module.weight.dtype == torch.float32:
+                    module = module.to(torch.bfloat16)
+
+
+def get_quantization_config(model_args: ModelConfig) -> BitsAndBytesConfig | None:
+    if model_args.load_in_4bit:
+        quantization_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=model_args.dtype,  # For consistency with model weights, we use the same value as `dtype`
+            bnb_4bit_quant_type=model_args.bnb_4bit_quant_type,
+            bnb_4bit_use_double_quant=model_args.use_bnb_nested_quant,
+            bnb_4bit_quant_storage=model_args.dtype,
+        )
+    elif model_args.load_in_8bit:
+        quantization_config = BitsAndBytesConfig(
+            load_in_8bit=True,
+        )
+    else:
+        quantization_config = None
+
+    return quantization_config
+
+
+def get_kbit_device_map() -> dict[str, int] | None:
+    if torch.cuda.is_available() or is_torch_xpu_available():
+        return {"": PartialState().local_process_index}
+    return None
+
+
+def get_peft_config(model_args: ModelConfig) -> "PeftConfig | None":
+    if model_args.use_peft is False:
+        return None
+
+    if not is_peft_available():
+        raise ValueError(
+            "You need to have PEFT library installed in your environment, make sure to install `peft`. "
+            "Make sure to run `pip install -U peft`."
+        )
+
+    peft_config = LoraConfig(
+        task_type=model_args.lora_task_type,
+        r=model_args.lora_r,
+        target_modules=model_args.lora_target_modules,
+        lora_alpha=model_args.lora_alpha,
+        lora_dropout=model_args.lora_dropout,
+        bias="none",
+        use_rslora=model_args.use_rslora,
+        use_dora=model_args.use_dora,
+        modules_to_save=model_args.lora_modules_to_save,
+    )
+
+    return peft_config
+
+
+def get_exp_cap(value, decimal=4):
+    """
+    Get the exponent cap of a value. This is used to cap the exponent of a value to avoid overflow. The formula is :
+    log(value.dtype.max) E.g. for float32 data type, the maximum exponent value is 88.7228 to 4 decimal points.
+
+    Args:
+        value (`torch.Tensor`):
+            The input tensor to obtain the data type
+        decimal (`int`):
+            The number of decimal points of the output exponent cap. eg: direct calling exp(log(torch.float32.max))
+            will result in inf so we cap the exponent to 88.7228 to avoid overflow.
+    """
+    vdtype_max = torch.zeros([1]).to(value.dtype) + torch.finfo(value.dtype).max
+    vdtype_log_max = torch.log(vdtype_max).to(value.device)
+    return (
+        torch.floor(vdtype_log_max * 10**decimal) / 10**decimal
+        if decimal > 0
+        else vdtype_log_max
+    )
+
+
+def cap_exp(value, cap=-1):
+    # Cap the exponent value below the upper-bound to avoid overflow, before calling torch.exp
+    cap = get_exp_cap(value) if cap < 0 else cap
+    return torch.exp(torch.clamp(value, max=cap))
+
+
+def print_rich_table(df: pd.DataFrame) -> None:
+    if not is_rich_available():
+        raise ImportError(
+            "The function `print_rich_table` requires the `rich` library. Please install it with `pip install rich`."
+        )
+    console = Console()
+    table = Table(show_lines=True)
+    for column in df.columns:
+        table.add_column(column)
+    for _, row in df.iterrows():
+        table.add_row(*row.astype(str).tolist())
+    console.print(table)
+
+
+SIMPLE_CHAT_TEMPLATE = "{% for message in messages %}{{message['role'].capitalize() + ': ' + message['content'] + '\n\n'}}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}"
+
+
+@dataclass
+class OnlineTrainerState(TrainerState):
+    """
+    Training state for online/on-policy trainers.
+
+    Extends [`~transformers.TrainerState`] with an `episode` counter to track the current rollout/episode.
+
+    Args:
+        episode (`int`, defaults to 0): Zero-based episode index.
+    """
+
+    episode: int = 0
+
+
+@dataclass
+class OnPolicyConfig(TrainingArguments):
+    r"""
+    Base configuration class for on-policy trainers.
+
+    This class includes only the parameters that are specific to some on-policy training. For a full list of training
+    arguments, please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this
+    class may differ from those in [`~transformers.TrainingArguments`].
+
+    Using [`~transformers.HfArgumentParser`] we can turn this class into
+    [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
+    command line.
+
+    Parameters
+    ----------
+        run_name (`str`, *optional*):
+            Name of the run.
+        dataset_num_proc (`int`, *optional*):
+            Number of processes to use for processing the dataset.
+        num_mini_batches (`int`, *optional*, defaults to `1`):
+            Number of minibatches to split a batch into.
+        total_episodes (`int`, *optional*):
+            Total number of episodes in the dataset.
+        local_rollout_forward_batch_size (`int`, *optional*, defaults to `64`):
+            Per rank no grad forward pass in the rollout phase.
+        num_sample_generations (`int`, *optional*, defaults to `10`):
+            Number of debugging samples generations (i.e., `generate_completions` calls) throughout training.
+        response_length (`int`, *optional*, defaults to `53`):
+            Length of the response.
+        stop_token (`str`, *optional*):
+            Specifies the stop token to use for text generation. This parameter is mutually exclusive with
+            `stop_token_id`.
+
+            - `None`: No stop token is applied, unless `stop_token_id` is specified.
+            - `'eos'`: Uses the tokenizer's `eos_token`.
+
+        stop_token_id (`int`, *optional*):
+            Specifies the ID of the stop token to use for text generation. If `None`, no stop token ID is applied,
+            unless `stop_token` is specified. This parameter is mutually exclusive with `stop_token`.
+        temperature (`float`, *optional*, defaults to `0.7`):
+            Sampling temperature.
+        missing_eos_penalty (`float`, *optional*):
+            Penalty applied to the score when the model fails to generate an EOS token. This is useful to encourage to
+            generate completions shorter than the maximum length (`max_new_tokens`). The penalty must be a positive
+            value.
+        sft_model_path (`str`, *optional*, defaults to `"EleutherAI/pythia-160m"`):
+            Path to the SFT model.
+        world_size (`int`, *optional*):
+            Number of processes (GPUs) to use for the training.
+        num_total_batches (`int`, *optional*):
+            Number of total batches to train.
+        micro_batch_size (`int`, *optional*):
+            Micro batch size across devices (HF's `per_device_train_batch_size` * `world_size`).
+        local_batch_size (`int`, *optional*):
+            Batch size per GPU (HF's `per_device_train_batch_size` * `gradient_accumulation_steps`).
+        batch_size (`int`, *optional*):
+            Batch size across devices (HF's `per_device_train_batch_size` * `world_size` *
+            `gradient_accumulation_steps`).
+        local_mini_batch_size (`int`, *optional*):
+            Mini batch size per GPU.
+        mini_batch_size (`int`, *optional*):
+            Mini batch size across GPUs.
+        push_to_hub (`bool`, *optional*, defaults to `False`):
+            Whether to push the model to the Hub after training.
+    """
+
+    # Parameters whose default values are overridden from TrainingArguments
+    logging_steps: float = field(
+        default=10,
+        metadata={
+            "help": "Log every X updates steps. Should be an integer or a float in range `[0,1)`. If smaller than 1, "
+            "will be interpreted as ratio of total training steps."
+        },
+    )
+    gradient_checkpointing: bool = field(
+        default=True,
+        metadata={
+            "help": "If True, use gradient checkpointing to save memory at the expense of slower backward pass."
+        },
+    )
+    bf16: bool | None = field(
+        default=None,
+        metadata={
+            "help": "Whether to use bf16 (mixed) precision instead of 32-bit. Requires Ampere or higher NVIDIA "
+            "architecture or Intel XPU or using CPU (use_cpu) or Ascend NPU. If not set, it defaults to `True` if "
+            "`fp16` is not set."
+        },
+    )
+
+    run_name: str | None = field(
+        default=None,
+        metadata={"help": "Name of the run."},
+    )
+    dataset_num_proc: int | None = field(
+        default=None,
+        metadata={"help": "Number of processes to use for processing the dataset."},
+    )
+    num_mini_batches: int = field(
+        default=1,
+        metadata={"help": "Number of minibatches to split a batch into."},
+    )
+    total_episodes: int | None = field(
+        default=None,
+        metadata={"help": "Total number of episodes in the dataset."},
+    )
+    local_rollout_forward_batch_size: int = field(
+        default=64,
+        metadata={"help": "Per rank no grad forward pass in the rollout phase."},
+    )
+    num_sample_generations: int = field(
+        default=10,
+        metadata={
+            "help": "Number of debugging samples generations (i.e., `generate_completions` calls) throughout training."
+        },
+    )
+    response_length: int = field(
+        default=53,
+        metadata={"help": "Length of the response."},
+    )
+    stop_token: Literal["eos"] | None = field(
+        default=None,
+        metadata={
+            "help": "Specifies the stop token to use for text generation. This parameter is mutually exclusive with "
+            "`stop_token_id`."
+        },
+    )
+    stop_token_id: int | None = field(
+        default=None,
+        metadata={
+            "help": "Specifies the ID of the stop token to use for text generation. If `None`, no stop token ID is "
+            "applied, unless `stop_token` is specified. This parameter is mutually exclusive with `stop_token`."
+        },
+    )
+    temperature: float = field(
+        default=0.7,
+        metadata={"help": "Sampling temperature."},
+    )
+    missing_eos_penalty: float | None = field(
+        default=None,
+        metadata={
+            "help": "Penalty applied to the score when the model fails to generate an EOS token. This is useful to "
+            "encourage to generate completions shorter than the maximum length (`max_new_tokens`). The penalty must be "
+            "a positive value."
+        },
+    )
+    sft_model_path: str = field(
+        default="EleutherAI/pythia-160m",
+        metadata={"help": "Path to the SFT model."},
+    )
+    world_size: int | None = field(
+        default=None,
+        metadata={"help": "Number of processes (GPUs) to use for the training."},
+    )
+    num_total_batches: int | None = field(
+        default=None,
+        metadata={"help": "Number of total batches to train."},
+    )
+    micro_batch_size: int | None = field(
+        default=None,
+        metadata={
+            "help": "Micro batch size across devices (HF's `per_device_train_batch_size` * `world_size`)."
+        },
+    )
+    local_batch_size: int | None = field(
+        default=None,
+        metadata={
+            "help": "Batch size per GPU (HF's `per_device_train_batch_size` * `gradient_accumulation_steps`)."
+        },
+    )
+    batch_size: int | None = field(
+        default=None,
+        metadata={
+            "help": "Batch size across devices (HF's `per_device_train_batch_size` * `world_size` * "
+            "`gradient_accumulation_steps`)."
+        },
+    )
+    local_mini_batch_size: int | None = field(
+        default=None,
+        metadata={"help": "Mini batch size per GPU."},
+    )
+    mini_batch_size: int | None = field(
+        default=None,
+        metadata={"help": "Mini batch size across GPUs."},
+    )
+    push_to_hub: bool = field(
+        default=False,
+        metadata={"help": "Whether to push the model to the Hub after training."},
+    )
+
+    def __post_init__(self):
+        self.bf16 = not (self.fp16) if self.bf16 is None else self.bf16
+
+        super().__post_init__()
+
+
+def first_true_indices(bools: torch.Tensor, dtype=torch.long) -> torch.Tensor:
+    """
+    Takes an N-dimensional bool tensor and returns an (N-1)-dimensional tensor of integers giving the position of the
+    first True in each "row".
+
+    Returns the length of the rows (bools.size(-1)) if no element is True in a given row.
+
+    Args:
+        bools (`torch.Tensor`):
+            An N-dimensional boolean tensor.
+        dtype (`torch.dtype`, optional):
+            The desired data type of the output tensor. Defaults to `torch.long`.
+
+    Returns
+    -------
+        `torch.Tensor`:
+            An (N-1)-dimensional tensor of integers indicating the position of the first True in each row. If no True
+            value is found in a row, returns the length of the row.
+    """
+    row_len = bools.size(-1)
+    zero_or_index = row_len * (~bools).type(dtype) + torch.arange(
+        row_len, dtype=dtype, device=bools.device
+    )
+    return torch.min(zero_or_index, dim=-1).values
+
+
+def get_reward(
+    model: torch.nn.Module,
+    query_responses: torch.Tensor,
+    pad_token_id: int,
+    context_length: int,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """
+    Computes the reward logits and the rewards for a given model and query responses.
+
+    Args:
+        model (`torch.nn.Module`):
+            The model used to compute the reward logits.
+        query_responses (`torch.Tensor`):
+            The tensor containing the query responses.
+        pad_token_id (`int`):
+            The token ID representing the pad token.
+        context_length (`int`):
+            The length of the context in the query responses.
+
+    Returns
+    -------
+        tuple:
+            - `reward_logits` (`torch.Tensor`):
+                The logits for the reward model.
+            - `final_rewards` (`torch.Tensor`):
+                The final rewards for each query response.
+            - `sequence_lengths` (`torch.Tensor`):
+                The lengths of the sequences in the query responses.
+    """
+    attention_mask = query_responses != pad_token_id
+    position_ids = attention_mask.cumsum(1) - attention_mask.long()  # exclusive cumsum
+    lm_backbone = getattr(model, model.base_model_prefix)
+    input_ids = torch.masked_fill(query_responses, ~attention_mask, 0)
+    output = lm_backbone(
+        input_ids=input_ids,
+        attention_mask=attention_mask,
+        position_ids=position_ids,
+        return_dict=True,
+        output_hidden_states=True,
+        use_cache=False,  # otherwise mistral-based RM would error out
+    )
+    reward_logits = model.score(output.hidden_states[-1])
+    sequence_lengths = (
+        first_true_indices(query_responses[:, context_length:] == pad_token_id)
+        - 1
+        + context_length
+    )
+    # https://github.com/huggingface/transformers/blob/dc68a39c8111217683bf49a4912d0c9018bab33d/src/transformers/models/gpt2/modeling_gpt2.py#L1454
+    return (
+        reward_logits,
+        reward_logits[
+            torch.arange(reward_logits.size(0), device=reward_logits.device),
+            sequence_lengths,
+        ].squeeze(-1),
+        sequence_lengths,
+    )
+
+
+def forward(
+    model: torch.nn.Module,
+    query_responses: torch.Tensor,
+    pad_token_id: int,
+) -> ModelOutput:
+    """
+    Performs a forward pass through the model with the given query responses and pad token ID.
+
+    Args:
+        model (`torch.nn.Module`):
+            The model to perform the forward pass.
+        query_responses (`torch.Tensor`):
+            The tensor containing the query responses.
+        pad_token_id (`int`):
+            The token ID representing the pad token.
+
+    Returns
+    -------
+        `ModelOutput`:
+            The output of the model, including hidden states.
+    """
+    attention_mask = query_responses != pad_token_id
+    position_ids = attention_mask.cumsum(1) - attention_mask.long()
+    input_ids = torch.masked_fill(query_responses, ~attention_mask, 0)
+    return model(
+        input_ids=input_ids,
+        attention_mask=attention_mask,
+        position_ids=position_ids,
+        return_dict=True,
+        output_hidden_states=True,
+    )
+
+
+def prepare_deepspeed(
+    model: torch.nn.Module,
+    per_device_train_batch_size: int,
+    fp16: bool = False,
+    bf16: bool = False,
+) -> torch.nn.Module:
+    """
+    Prepares the model for training with DeepSpeed (both for stage 2 and 3), configuring the appropriate settings based
+    on the model and batch size.
+
+    Args:
+        model (`torch.nn.Module`):
+            The model to be prepared for DeepSpeed training.
+        per_device_train_batch_size (`int`):
+            The training batch size per device.
+        fp16 (`bool`, defaults to `False`):
+            Whether to use FP16 precision.
+        bf16 (`bool`, defaults to `False`):
+            Whether to use BF16 precision.
+
+    Returns
+    -------
+        `torch.nn.Module`:
+            The model initialized and configured with DeepSpeed for training.
+    """
+    import deepspeed
+
+    deepspeed_plugin = AcceleratorState().deepspeed_plugin
+    config_kwargs = deepspeed_plugin.deepspeed_config
+    if config_kwargs["zero_optimization"]["stage"] != 3:
+        config_kwargs["train_micro_batch_size_per_gpu"] = per_device_train_batch_size
+        config_kwargs = {
+            "train_micro_batch_size_per_gpu": config_kwargs[
+                "train_micro_batch_size_per_gpu"
+            ],
+            "prescale_gradients": False,
+            "wall_clock_breakdown": False,
+        }
+        if bf16:
+            config_kwargs["bf16"] = {"enabled": True}
+        elif fp16:
+            config_kwargs["fp16"] = {"enabled": True}
+    elif hasattr(model, "config"):
+        hidden_size = (
+            max(model.config.hidden_sizes)
+            if getattr(model.config, "hidden_sizes", None)
+            else getattr(model.config, "hidden_size", None)
+        )
+        if hidden_size is not None and config_kwargs["zero_optimization"]["stage"] == 3:
+            # Note that `stage3_prefetch_bucket_size` can produce DeepSpeed messages like: `Invalidate trace cache @ step 0: expected module 1, but got module 0`
+            # This is expected and is not an error, see: https://github.com/microsoft/DeepSpeed/discussions/4081
+            config_kwargs.update(
+                {
+                    "zero_optimization.reduce_bucket_size": hidden_size * hidden_size,
+                    "zero_optimization.stage3_param_persistence_threshold": 10
+                    * hidden_size,
+                    "zero_optimization.stage3_prefetch_bucket_size": 0,
+                }
+            )
+    model, *_ = deepspeed.initialize(model=model, config=config_kwargs)
+    model.eval()
+    return model
+
+
+def truncate_response(
+    stop_token_id: int, pad_token_id: int, responses: torch.Tensor
+) -> torch.Tensor:
+    """
+    Truncates the responses at the first occurrence of the stop token, filling the rest with pad tokens.
+
+    Args:
+        stop_token_id (`int`):
+            The token ID representing the stop token where truncation occurs.
+        pad_token_id (`int`):
+            The token ID representing the pad token used to fill the truncated responses.
+        responses (`torch.Tensor`):
+            The tensor containing the responses to be truncated.
+
+    Returns
+    -------
+        `torch.Tensor`:
+            The truncated responses tensor with pad tokens filled after the stop token.
+    """
+    trunc_idxs = first_true_indices(responses == stop_token_id).unsqueeze(-1)
+    new_size = [1] * (len(responses.size()) - 1) + [responses.shape[1]]
+    idxs = torch.arange(responses.shape[1], device=responses.device).view(*new_size)
+    postprocessed_responses = torch.masked_fill(
+        responses, idxs > trunc_idxs, pad_token_id
+    )
+    return postprocessed_responses
+
+
+def generate(
+    lm_backbone: torch.nn.Module,
+    queries: torch.Tensor,
+    pad_token_id: int,
+    generation_config: GenerationConfig,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """
+    Generates sequences from the language model backbone in a way that does not affect padding tokens.
+
+    Args:
+        lm_backbone (`torch.nn.Module`):
+            The language model backbone used for generation.
+        queries (`torch.Tensor`):
+            The tensor containing the input queries.
+        pad_token_id (`int`):
+            The token ID representing the pad token.
+        generation_config ([`~transformers.GenerationConfig`]):
+            The configuration for the generation process.
+
+    Returns
+    -------
+        tuple:
+            - `generated_sequences` (`torch.Tensor`):
+                The concatenated tensor of input queries and generated sequences.
+            - `logits` (`torch.Tensor`):
+                The logits output from the generation process.
+    """
+    context_length = queries.shape[1]
+    attention_mask = queries != pad_token_id
+    input_ids = torch.masked_fill(queries, ~attention_mask, 0)
+    output = lm_backbone.generate(
+        input_ids=input_ids,
+        attention_mask=attention_mask,
+        # position_ids=attention_mask.cumsum(1) - attention_mask.long(), # not needed: already adjusted in generations
+        # https://github.com/huggingface/transformers/blob/ac33aeeeee2a7a89b89c93c2962e6feb90daef0a/src/transformers/models/gpt2/modeling_gpt2.py#L1227-L1250
+        generation_config=generation_config,
+        return_dict_in_generate=True,
+        output_scores=True,
+    )
+    logits = torch.stack(output.scores, 1)
+    return torch.cat((queries, output.sequences[:, context_length:]), dim=1), logits
+
+
+@torch.no_grad()
+def batch_generation(
+    model: torch.nn.Module,
+    queries: torch.Tensor,
+    local_rollout_forward_batch_size: int,
+    pad_token_id: int,
+    generation_config: GenerationConfig,
+):
+    query_responses = []
+    logitss = []
+    batch_size = queries.shape[0]
+    for i in range(0, batch_size, local_rollout_forward_batch_size):
+        query = queries[i : i + local_rollout_forward_batch_size]
+        query_response, logits = generate(
+            model,
+            query,
+            pad_token_id,
+            generation_config,
+        )
+        query_responses.append(query_response)
+        logitss.append(logits)
+
+    # padding tensors
+    padded_query_responses = pad(
+        query_responses, padding_value=pad_token_id, padding_side="right"
+    )
+    padded_logitss = pad(logitss, padding_value=0, padding_side="right")
+
+    # reshaping
+    padded_query_responses = padded_query_responses.view(
+        -1, padded_query_responses.shape[-1]
+    )[:batch_size]
+    padded_logitss = padded_logitss.view(-1, *padded_logitss.shape[2:])[:batch_size]
+
+    return padded_query_responses, padded_logitss
+
+
+def add_bos_token_if_needed(
+    bos_token_id: int | None,
+    prompt_len_input_ids: int,
+    prompt_tokens: dict[str, list[int]],
+    chosen_prompt_len_input_ids: int,
+    chosen_tokens: dict[str, list[int]],
+    rejected_prompt_len_input_ids: int,
+    rejected_tokens: dict[str, list[int]],
+):
+    if bos_token_id is not None:
+        if (
+            prompt_len_input_ids == 0
+            or bos_token_id != prompt_tokens["prompt_input_ids"][0]
+        ):
+            prompt_tokens["prompt_input_ids"] = [bos_token_id] + prompt_tokens[
+                "prompt_input_ids"
+            ]
+            prompt_tokens["prompt_attention_mask"] = [1] + prompt_tokens[
+                "prompt_attention_mask"
+            ]
+        if (
+            chosen_prompt_len_input_ids == 0
+            or bos_token_id != chosen_tokens["prompt_input_ids"][0]
+        ):
+            chosen_tokens["prompt_input_ids"] = [bos_token_id] + chosen_tokens[
+                "prompt_input_ids"
+            ]
+            chosen_tokens["prompt_attention_mask"] = [1] + chosen_tokens[
+                "prompt_attention_mask"
+            ]
+        if (
+            rejected_prompt_len_input_ids == 0
+            or bos_token_id != rejected_tokens["prompt_input_ids"][0]
+        ):
+            rejected_tokens["prompt_input_ids"] = [bos_token_id] + rejected_tokens[
+                "prompt_input_ids"
+            ]
+            rejected_tokens["prompt_attention_mask"] = [1] + rejected_tokens[
+                "prompt_attention_mask"
+            ]
+    return prompt_tokens, chosen_tokens, rejected_tokens
+
+
+def add_eos_token_if_needed(
+    eos_token_id: int,
+    chosen_tokens: dict[str, list[int]],
+    rejected_tokens: dict[str, list[int]],
+):
+    if (
+        len(chosen_tokens["input_ids"]) == 0
+        or eos_token_id != chosen_tokens["input_ids"][-1]
+    ):
+        chosen_tokens["input_ids"].append(eos_token_id)
+        chosen_tokens["attention_mask"].append(1)
+    if (
+        len(rejected_tokens["input_ids"]) == 0
+        or eos_token_id != rejected_tokens["input_ids"][-1]
+    ):
+        rejected_tokens["input_ids"].append(eos_token_id)
+        rejected_tokens["attention_mask"].append(1)
+    return chosen_tokens, rejected_tokens
+
+
+def truncate_right(
+    input_ids: torch.Tensor, stop_token_id: int, pad_token_id: int
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """
+    Truncates the input tensor from the right side after the first occurrence of the stop token.
+
+    Args:
+        input_ids (`torch.Tensor`):
+            The tensor containing the responses to be truncated
+        stop_token_id (`int`):
+            The token ID representing the stop token where truncation occurs
+        pad_token_id (`int`):
+            The token ID representing the pad token used to fill the truncated responses
+
+    Returns
+    -------
+        tuple:
+            - `output_ids` (`torch.Tensor`):
+                The truncated responses tensor with pad tokens filled after the stop token
+            - `mask` (`torch.Tensor`):
+                The mask tensor to indicate the padding tokens
+    """
+    trunc_idxs = first_true_indices(input_ids == stop_token_id).unsqueeze(-1)
+    new_size = [1] * (len(input_ids.size()) - 1) + [input_ids.shape[1]]
+    idxs = torch.arange(input_ids.shape[1], device=input_ids.device).view(*new_size)
+    output_ids = torch.masked_fill(input_ids, idxs > trunc_idxs, pad_token_id)
+    mask = torch.masked_fill(torch.ones_like(input_ids), idxs > trunc_idxs, 0)
+    return output_ids, mask
+
+
+def empty_cache() -> None:
+    """Empties the cache of the available torch device.
+
+    This function checks for the availability of different torch devices (XPU, MLU, NPU, CUDA) and empties the cache of
+    the first available device it finds.
+
+    If none of the specific devices are available, it defaults to emptying the CUDA cache.
+    """
+    if is_torch_xpu_available():
+        torch.xpu.empty_cache()
+    elif is_torch_mlu_available():
+        torch.mlu.empty_cache()
+    elif is_torch_npu_available():
+        torch.npu.empty_cache()
+    else:
+        torch.cuda.empty_cache()
+
+
+def generate_model_card(
+    base_model: str | None,
+    model_name: str,
+    hub_model_id: str,
+    dataset_name: str | None,
+    tags: list[str],
+    wandb_url: str | None,
+    trainer_name: str,
+    trainer_citation: str | None = None,
+    template_file: str | None = None,
+    paper_title: str | None = None,
+    paper_id: str | None = None,
+    comet_url: str | None = None,
+) -> ModelCard:
+    """
+    Generate a [`~huggingface_hub.ModelCard`] from a template.
+
+    Args:
+        base_model (`str` or `None`):
+            Base model name.
+        model_name (`str`):
+            Model name.
+        hub_model_id (`str`):
+            Hub model ID as `username/model_id`.
+        dataset_name (`str` or `None`):
+            Dataset name.
+        tags (`list[str]`):
+            Tags.
+        wandb_url (`str` or `None`):
+            Weights & Biases run URL.
+        comet_url (`str` or `None`):
+            Comet experiment URL.
+        trainer_name (`str`):
+            Trainer name.
+        trainer_citation (`str` or `None`, defaults to `None`):
+            Trainer citation as a BibTeX entry.
+        template_file (`str` *optional*):
+            Template file name located in the `trl/templates` directory. Defaults to `lm_model_card.md`.
+        paper_title (`str` or `None`, defaults to `None`):
+            Paper title.
+        paper_id (`str` or `None`, defaults to `None`):
+            ArXiv paper ID as `YYMM.NNNNN`.
+
+    Returns
+    -------
+        [`~huggingface_hub.ModelCard`]:
+            A ModelCard object.
+    """
+    card_data = ModelCardData(
+        base_model=base_model,
+        datasets=dataset_name,
+        library_name="transformers",
+        licence="license",
+        model_name=model_name,
+        tags=["generated_from_trainer", *tags],
+    )
+    template_file = template_file or "lm_model_card.md"
+    card = ModelCard.from_template(
+        card_data,
+        template_path=str(
+            pkg_resources.files("trl").joinpath(f"templates/{template_file}")
+        ),
+        base_model=base_model,
+        model_name=model_name,
+        hub_model_id=hub_model_id,
+        dataset_name=dataset_name,
+        wandb_url=wandb_url,
+        comet_url=comet_url,
+        trainer_name=trainer_name,
+        trainer_citation=trainer_citation,
+        paper_title=paper_title,
+        paper_id=paper_id,
+        trl_version=version("trl"),
+        transformers_version=version("transformers"),
+        pytorch_version=version("torch"),
+        datasets_version=version("datasets"),
+        tokenizers_version=version("tokenizers"),
+    )
+    return card
+
+
+def get_comet_experiment_url() -> str | None:
+    """
+    If Comet integration is enabled, return the URL of the current Comet experiment; otherwise, return `None`.
+    """
+    if not is_comet_available():
+        return None
+
+    if comet_ml.get_running_experiment() is not None:
+        return comet_ml.get_running_experiment().url
+
+    return None
+
+
+def log_table_to_comet_experiment(name: str, table: pd.DataFrame) -> None:
+    """
+    If Comet integration is enabled logs a table to the Comet experiment if it is currently running.
+
+    Args:
+        name (`str`):
+            Table name.
+        table (`pandas.DataFrame`):
+            The Pandas DataFrame containing the table to log.
+    """
+    if not is_comet_available():
+        raise ModuleNotFoundError(
+            "The comet-ml is not installed. Please install it first: pip install comet-ml"
+        )
+
+    experiment = comet_ml.get_running_experiment()
+    if experiment is not None:
+        experiment.log_table(tabular_data=table, filename=name)
+
+
+def flush_left(
+    mask: torch.Tensor, *tensors: torch.Tensor
+) -> torch.Tensor | tuple[torch.Tensor, ...]:
+    """
+    Shift non-zero elements in the mask and corresponding tensors to the left.
+
+    This function operates on a binary mask and any number of additional tensors with the same dimensions as the mask.
+    For each row, non-zero values are shifted to the leftmost positions. Then, columns that contain only zeros across
+    all rows are truncated from the mask and tensors. Visually, this operation can be represented as follows:
+
+    ```
+    [[0, 0, x, x, x, x],  ->  [[x, x, x, x],
+     [0, x, x, x, 0, 0]]       [x, x, x, 0]]
+    ```
+
+    Args:
+        mask (`torch.Tensor`):
+            2D tensor (binary mask) with shape `(N, M)`.
+        *tensors (`torch.Tensor`):
+            One or more 2D tensors with the same shape as `mask`. These tensors will be processed alongside `mask`,
+            with non-zero values shifted and excess zero columns truncated in the same manner.
+
+    Returns
+    -------
+        `torch.Tensor`:
+            Updated binary mask with non-zero values flushed to the left and trailing zero columns removed.
+        `*torch.Tensor`
+            Updated tensors, processed in the same way as the mask.
+
+    Example:
+    ```python
+    >>> mask = torch.tensor([[0, 0, 1, 1, 1], [0, 1, 1, 0, 0]])
+    >>> tensor = torch.tensor([[9, 9, 2, 3, 4], [9, 5, 6, 9, 9]])
+    >>> new_mask, new_tensor = flush_left(mask, tensor)
+    >>> print(new_mask)
+    tensor([[1, 1, 1],
+            [1, 1, 0]])
+
+    >>> print(new_tensor)
+    tensor([[2, 3, 4],
+            [5, 6, 0]])
+    ```
+    """
+    _, M = mask.shape
+
+    # Create copy of mask and tensors
+    mask_copy = mask.clone()
+    tensors = [t.clone() for t in tensors]
+
+    # Shift non-zero values to the left
+    first_non_zero = mask_copy.argmax(dim=1)
+    pos = torch.arange(M, device=mask_copy.device).unsqueeze(0)
+    idx_roll = (pos + first_non_zero.unsqueeze(1)) % M
+    mask_roll = mask_copy.gather(1, idx_roll)
+    rolled_tensors = [t.gather(1, idx_roll) for t in tensors]
+
+    # Truncate trailing columns that are all zeros in mask_roll
+    col_sums = mask_roll.sum(dim=0)
+    empty_cols = col_sums == 0
+    first_empty_col = int(empty_cols.to(torch.int8).argmax()) if empty_cols.any() else M
+    flushed_mask = mask_roll[:, :first_empty_col]
+    flushed_tensors = [t[:, :first_empty_col] for t in rolled_tensors]
+
+    if not flushed_tensors:
+        return flushed_mask
+    return flushed_mask, *flushed_tensors
+
+
+def flush_right(
+    mask: torch.Tensor, *tensors: torch.Tensor
+) -> torch.Tensor | tuple[torch.Tensor, ...]:
+    """
+    Shift non-zero elements in the mask and corresponding tensors to the right. See `flush_left` for details.
+    """
+    _, M = mask.shape
+
+    # Create copy of mask and tensors
+    mask_copy = mask.clone()
+    tensors = [t.clone() for t in tensors]
+
+    # Shift non-zero values to the right
+    flipped_mask = torch.fliplr(mask_copy)
+    first_non_zero = flipped_mask.argmax(dim=1)
+    pos = torch.arange(M, device=mask_copy.device).unsqueeze(0)
+    idx_roll = (pos - first_non_zero.unsqueeze(1)) % M
+    mask_roll = mask_copy.gather(1, idx_roll)
+    rolled_tensors = [t.gather(1, idx_roll) for t in tensors]
+
+    # Truncate leading columns that are all zeros in mask_roll
+    col_sums = mask_roll.sum(dim=0)
+    non_empty_cols = col_sums != 0
+    first_non_empty_col = (
+        int(non_empty_cols.to(torch.int8).argmax()) if non_empty_cols.any() else M
+    )
+    flushed_mask = mask_roll[:, first_non_empty_col:]
+    flushed_tensors = [t[:, first_non_empty_col:] for t in rolled_tensors]
+
+    if not flushed_tensors:
+        return flushed_mask
+    return flushed_mask, *flushed_tensors
+
+
+def selective_log_softmax(logits, index) -> torch.Tensor:
+    """
+    A memory-efficient implementation of the common `log_softmax -> gather` operation.
+
+    This function is equivalent to the following naive implementation:
+    ```python
+    logps = torch.gather(
+        logits.log_softmax(-1), dim=-1, index=index.unsqueeze(-1)
+    ).squeeze(-1)
+    ```
+
+    Args:
+        logits (`torch.Tensor`):
+            Logits tensor of shape `(..., num_classes)`.
+        index (`torch.Tensor`):
+            Index tensor of shape `(...)`, specifying the positions to gather from the log-softmax output.
+
+    Returns
+    -------
+        `torch.Tensor`:
+            Gathered log probabilities with the same shape as `index`.
+    """
+    if logits.dtype in [torch.float32, torch.float64]:
+        selected_logits = torch.gather(
+            logits, dim=-1, index=index.unsqueeze(-1)
+        ).squeeze(-1)
+        # loop to reduce peak mem consumption
+        logsumexp_values = torch.stack([torch.logsumexp(lg, dim=-1) for lg in logits])
+        per_token_logps = (
+            selected_logits - logsumexp_values
+        )  # log_softmax(x_i) = x_i - logsumexp(x)
+    else:
+        # logsumexp approach is unstable with bfloat16, fall back to slightly less efficient approach
+        per_token_logps = []
+        for row_logits, row_labels in zip(
+            logits, index, strict=True
+        ):  # loop to reduce peak mem consumption
+            row_logps = F.log_softmax(row_logits, dim=-1)
+            row_per_token_logps = row_logps.gather(
+                dim=-1, index=row_labels.unsqueeze(-1)
+            ).squeeze(-1)
+            per_token_logps.append(row_per_token_logps)
+        per_token_logps = torch.stack(per_token_logps)
+    return per_token_logps
+
+
+def entropy_from_logits(logits: torch.Tensor, chunk_size: int = 128) -> torch.Tensor:
+    """
+    Compute the Shannon entropy (in nats) for each row of *logits* in a memory-efficient way.
+
+    Instead of materializing the full softmax for all rows at once, the logits are flattened to shape (N, num_classes),
+    where N is the product of all leading dimensions. Computation is then performed in chunks of size `chunk_size`
+    along this flattened dimension, reducing peak memory usage. The result is reshaped back to match the input's
+    leading dimensions.
+
+    Args:
+        logits (`torch.Tensor`):
+            Logits tensor of shape `(..., num_classes)`. Entropy is taken along the last axis; all leading dimensions
+            are preserved in the output.
+        chunk_size (`int`, *optional*, defaults to `128`):
+            Number of rows from the flattened logits to process per iteration. Smaller values reduce memory usage at
+            the cost of more iterations.
+
+    Returns
+    -------
+        `torch.Tensor`:
+            Entropy values with shape `logits.shape[:-1]`.
+    """
+    original_shape = logits.shape[:-1]  # all dims except num_classes
+    num_classes = logits.shape[-1]
+
+    # Flatten all leading dimensions into one
+    flat_logits = logits.reshape(-1, num_classes)
+
+    entropies = []
+    for chunk in flat_logits.split(chunk_size, dim=0):
+        logps = F.log_softmax(chunk, dim=-1)
+        chunk_entropy = -(torch.exp(logps) * logps).sum(-1)
+        entropies.append(chunk_entropy)
+
+    entropies = torch.cat(entropies, dim=0)
+    return entropies.reshape(original_shape)
+
+
+def print_prompt_completions_sample(
+    prompts: list,
+    completions: list,
+    rewards: dict[str, list[float]],
+    advantages: list[float],
+    step: int,
+    num_samples: int = None,
+) -> None:
+    """
+    Print out a sample of model completions to the console with multiple reward metrics.
+
+    This function creates a nicely formatted table showing prompt-completion pairs, useful for monitoring model outputs
+    during training. It requires the `rich` library to be installed.
+
+    Args:
+        prompts (`list`):
+            List of prompts. Can be either strings or lists of messages.
+        completions (`list`):
+            List of completions corresponding to the prompts. Can be either strings or lists of messages.
+        rewards (`dict[str, list[float]]`):
+            Dictionary where keys are reward names and values are lists of rewards.
+        advantages (`list[float]`):
+            List of advantages corresponding to the prompts and completions.
+        step (`int`):
+            Current training step number, used in the output title.
+        num_samples (`int`, *optional*):
+            Number of random samples to display. If `None` (default), all items will be displayed.
+
+    Example:
+    ```python
+    >>> from trl.trainer.utils import print_prompt_completions_sample
+
+    >>> prompts = ["The sky is", "The sun is"]
+    >>> completions = [" blue.", " in the sky."]
+    >>> rewards = {"Correctness": [0.123, 0.456], "Format": [0.789, 0.101]}
+    >>> advantages = [0.987, 0.654]
+    >>> print_prompt_completions_sample(prompts, completions, rewards, advantages, 42)
+    ╭──────────────────────────── Step 42 ─────────────────────────────╮
+    │ ┏━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━┓ │
+    │ ┃ Prompt     ┃ Completion   ┃ Correctness ┃ Format ┃ Advantage ┃ │
+    │ ┡━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━┩ │
+    │ │ The sky is │  blue.       │        0.12 │   0.79 │      0.99 │ │
+    │ ├────────────┼──────────────┼─────────────┼────────┼───────────┤ │
+    │ │ The sun is │  in the sky. │        0.46 │   0.10 │      0.65 │ │
+    │ └────────────┴──────────────┴─────────────┴────────┴───────────┘ │
+    ╰──────────────────────────────────────────────────────────────────╯
+    ```
+    """
+    if not is_rich_available():
+        raise ImportError(
+            "The function `print_prompt_completions_sample` requires the `rich` library. Please install it with "
+            "`pip install rich`."
+        )
+    console = Console()
+    table = Table(show_header=True, header_style="bold white", expand=True)
+
+    # Add columns
+    table.add_column("Prompt", style="bright_yellow")
+    table.add_column("Completion", style="bright_green")
+    for reward_name in rewards:
+        table.add_column(reward_name, style="bold cyan", justify="right")
+    table.add_column("Advantage", style="bold magenta", justify="right")
+
+    def format_entry(entry) -> Text:
+        t = Text()
+        if isinstance(entry, list) and all(isinstance(m, dict) for m in entry):
+            for j, msg in enumerate(entry):
+                role = msg.get("role", "")
+                if "content" in msg:
+                    # Chat message
+                    t.append(f"{role.upper()}\n", style="bold red")
+                    t.append(msg["content"])
+                elif "name" in msg and "args" in msg:
+                    # Tool call
+                    t.append(f"{role.upper()}\n", style="bold red")
+                    t.append(f"{msg['name']}({msg['args']})")
+                else:
+                    # Fallback
+                    t.append(str(msg))
+                if j < len(entry) - 1:
+                    t.append("\n\n")
+        else:
+            t.append(str(entry))
+        return t
+
+    # Some basic input validation
+    if num_samples is not None:
+        if num_samples >= len(prompts):
+            num_samples = None
+        elif num_samples <= 0:
+            return
+
+    # Subsample data if num_samples is specified
+    if num_samples is not None:
+        indices = random.sample(range(len(prompts)), num_samples)
+        prompts = [prompts[i] for i in indices]
+        completions = [completions[i] for i in indices]
+        rewards = {key: [val[i] for i in indices] for key, val in rewards.items()}
+        advantages = [advantages[i] for i in indices]
+
+    for i in range(len(prompts)):
+        reward_values = [
+            f"{rewards[key][i]:.2f}" for key in rewards.keys()
+        ]  # 2 decimals
+        table.add_row(
+            format_entry(prompts[i]),
+            format_entry(completions[i]),
+            *reward_values,
+            f"{advantages[i]:.2f}",
+        )
+        table.add_section()  # Adds a separator between rows
+
+    panel = Panel(table, expand=False, title=f"Step {step}", border_style="bold white")
+    console.print(panel)
+
+
+class RepeatSampler(Sampler):
+    """
+    Sampler that repeats the indices of a dataset in a structured manner.
+
+    Args:
+        data_source (`Sized`):
+            Dataset to sample from.
+        mini_repeat_count (`int`):
+            Number of times to repeat each index per batch.
+        batch_size (`int`, *optional*, defaults to `1`):
+            Number of unique indices per batch.
+        repeat_count (`int`, *optional*, defaults to `1`):
+            Number of times to repeat the full sampling process.
+        shuffle (`bool`, *optional*, defaults to `True`):
+            Whether to shuffle the dataset.
+        seed (`int`, *optional*):
+            Random seed for reproducibility (only affects this sampler).
+
+    Example:
+    ```python
+    >>> sampler = RepeatSampler(["a", "b", "c", "d", "e", "f", "g"], mini_repeat_count=2, batch_size=3, repeat_count=4)
+    >>> list(sampler)
+    [4, 4, 3, 3, 0, 0,
+     4, 4, 3, 3, 0, 0,
+     4, 4, 3, 3, 0, 0,
+     4, 4, 3, 3, 0, 0,
+     1, 1, 2, 2, 6, 6,
+     1, 1, 2, 2, 6, 6,
+     1, 1, 2, 2, 6, 6,
+     1, 1, 2, 2, 6, 6]
+    ```
+
+    ```txt
+    mini_repeat_count = 3
+          -   -   -
+         [0,  0,  0,  1,  1,  1,  2,  2,  2,  3,  3,  3,      |
+          4,  4,  4,  5,  5,  5,  6,  6,  6,  7,  7,  7,      |
+          8,  8,  8,  9,  9,  9, 10, 10, 10, 11, 11, 11,      |
+                                                                repeat_count = 2
+          0,  0,  0,  1,  1,  1,  2,  2,  2,  3,  3,  3,      |
+          4,  4,  4,  5,  5,  5,  6,  6,  6,  7,  7,  7,      |
+          8,  8,  8,  9,  9,  9, 10, 10, 10, 11, 11, 11, ...] |
+          ---------   ---------   ---------   ---------
+           ---------   ---------   ---------   ---------
+            ---------   ---------   ---------   ---------
+                         batch_size = 12
+    ```
+    """
+
+    def __init__(
+        self,
+        data_source: Sized,
+        mini_repeat_count: int,
+        batch_size: int = 1,
+        repeat_count: int = 1,
+        shuffle: bool = True,
+        seed: int | None = None,
+    ):
+        self.data_source = data_source
+        self.mini_repeat_count = mini_repeat_count
+        self.batch_size = batch_size
+        self.repeat_count = repeat_count
+        self.num_samples = len(data_source)
+        self.shuffle = shuffle
+        self.seed = seed
+
+        if shuffle:
+            self.generator = torch.Generator()  # Create a local random generator
+            if seed is not None:
+                self.generator.manual_seed(seed)
+
+    def __iter__(self):
+        if self.shuffle:
+            # E.g., [2, 4, 3, 1, 0, 6, 5] (num_samples = 7)
+            indexes = torch.randperm(
+                self.num_samples, generator=self.generator
+            ).tolist()
+        else:
+            indexes = list(range(self.num_samples))
+
+        #    [2, 4, 3, 1, 0, 6, 5]
+        # -> [[2, 4, 3], [1, 0, 6], [5]]  (batch_size = 3)
+        indexes = [
+            indexes[i : i + self.batch_size]
+            for i in range(0, len(indexes), self.batch_size)
+        ]
+
+        #    [[2, 4, 3], [1, 0, 6], [5]]
+        # -> [[2, 4, 3], [1, 0, 6]]
+        indexes = [chunk for chunk in indexes if len(chunk) == self.batch_size]
+
+        for chunk in indexes:
+            for _ in range(self.repeat_count):
+                for index in chunk:
+                    for _ in range(self.mini_repeat_count):
+                        yield index
+
+    def __len__(self) -> int:
+        return (
+            (self.num_samples // self.batch_size)
+            * self.batch_size
+            * self.mini_repeat_count
+            * self.repeat_count
+        )
+
+
+# torch.nanstd doesn't exist, so we define it here
+def nanstd(tensor: torch.Tensor) -> torch.Tensor:
+    """
+    Compute the standard deviation of a tensor, ignoring NaNs. This function only supports 1D tensors.
+
+    Args:
+        tensor (`torch.Tensor`):
+            Input tensor of shape `(N,)`.
+
+    Returns
+    -------
+        `torch.Tensor`:
+            Standard deviation of the tensor, ignoring NaNs.
+    """
+    variance = torch.nanmean(
+        (tensor - torch.nanmean(tensor, keepdim=True)) ** 2
+    )  # Compute variance ignoring NaNs
+    count = torch.sum(~torch.isnan(tensor))  # Count of non-NaN values
+    variance *= count / (count - 1)  # Bessel's correction
+    return torch.sqrt(variance)
+
+
+def split_tensor_dict(
+    tensor_dict: dict[str, torch.Tensor | None], num_chunks: int
+) -> list[dict[str, torch.Tensor | None]]:
+    """
+    Splits a dictionary of tensors along the first dimension into `num_chunks` equal parts.
+
+    Example:
+    ```python
+    >>> x = torch.arange(12).reshape(6, 2)
+    >>> y = torch.arange(6).reshape(6, 1)
+    >>> tensor_dict = {"x": x, "y": y}
+    >>> split_tensor_dict(tensor_dict, 3)
+    [
+        {"x": tensor([[0, 1], [2, 3]]), "y": tensor([[0], [1]])},
+        {"x": tensor([[4, 5], [6, 7]]), "y": tensor([[2], [3]])},
+        {"x": tensor([[ 8,  9], [10, 11]]), "y": tensor([[4], [5]])}
+    ]
+    ```
+    """
+    first_tensor = next(tensor for tensor in tensor_dict.values() if tensor is not None)
+    chunk_size = first_tensor.shape[0] // num_chunks
+    chunks = []
+    for i in range(num_chunks):
+        chunk_dict = {}
+        for key, tensor in tensor_dict.items():
+            if tensor is not None and (isinstance(tensor, list) or tensor.ndim > 0):
+                chunk_dict[key] = tensor[i * chunk_size : (i + 1) * chunk_size]
+            elif tensor is not None and tensor.ndim == 0:
+                chunk_dict[key] = tensor
+            else:
+                chunk_dict[key] = None
+        chunks.append(chunk_dict)
+    return chunks
+
+
+def shuffle_sequence_dict(
+    seq_dict: dict[str, Sequence | None],
+) -> dict[str, Sequence | None]:
+    """
+    Shuffles all sequence-like values in a dictionary along the first dimension in unison.
+
+    Example:
+    ```python
+    >>> x = torch.arange(6).reshape(3, 2)
+    >>> y = ["a", "b", "c"]
+    >>> seq_dict = {"x": x, "y": y}
+    >>> shuffle_sequence_dict(seq_dict)
+    {'x': tensor([[2, 3],
+                  [0, 1],
+                  [4, 5]]),
+     'y': ['b', 'a', 'c']}
+    ```
+    """
+    # Determine batch size from the first non-None sequence
+    batch_size = len(next(v for v in seq_dict.values() if v is not None))
+    permutation = torch.randperm(batch_size)
+
+    def permute(v: Sequence | None) -> Sequence | None:
+        if v is None:
+            return None
+        if isinstance(v, torch.Tensor) and v.ndim == 0:
+            return v
+        if isinstance(v, torch.Tensor) and v.ndim >= 1:
+            return v[permutation]
+        return [v[i] for i in permutation]
+
+    return {key: permute(val) for key, val in seq_dict.items()}
+
+
+def nanmin(tensor: torch.Tensor) -> torch.Tensor:
+    """
+    Compute the minimum value of a tensor, ignoring NaNs. This function only supports 1D tensors.
+
+    Args:
+        tensor (`torch.Tensor`): Input tensor of shape `(N,)`.
+
+    Returns
+    -------
+        `torch.Tensor`: Minimum value of the tensor, ignoring NaNs. Returns NaN if all values are NaN.
+    """
+    if torch.isnan(tensor).all():
+        return torch.tensor(float("nan"), dtype=tensor.dtype, device=tensor.device)
+    return torch.min(tensor[~torch.isnan(tensor)])
+
+
+def nanmax(tensor: torch.Tensor) -> torch.Tensor:
+    """
+    Compute the maximum value of a tensor, ignoring NaNs. This function only supports 1D tensors.
+
+    Args:
+        tensor (`torch.Tensor`): Input tensor of shape `(N,)`.
+
+    Returns
+    -------
+        `torch.Tensor`: Maximum value of the tensor, ignoring NaNs. Returns NaN if all values are NaN.
+    """
+    if torch.isnan(tensor).all():
+        return torch.tensor(float("nan"), dtype=tensor.dtype, device=tensor.device)
+    return torch.max(tensor[~torch.isnan(tensor)])
+
+
+def identity(x):
+    """Do we really need docs for this?"""
+    return x
+
+
+def split_pixel_values_by_grid(
+    batch: dict[str, torch.Tensor],
+) -> dict[str, torch.Tensor | list[torch.Tensor]]:
+    """
+    Splits `batch["pixel_values"]` into a list of tensors based on the product of each row in `batch["image_grid_thw"]`
+    and batch["num_images"] while keeping other entries unchanged.
+    """
+    if (
+        "image_grid_thw" not in batch
+        or "pixel_values" not in batch
+        or "num_images" not in batch
+    ):
+        return batch
+
+    lengths = batch["image_grid_thw"].prod(-1).tolist()  # [num_images]
+    pixel_values = batch["pixel_values"]  # [total, feature_dim]
+
+    if sum(lengths) != pixel_values.size(0):
+        raise ValueError(
+            f"Mismatch: sum(lengths) = {sum(lengths)} != pixel_values.size(0) = {pixel_values.size(0)}"
+        )
+
+    boundaries = [0, *accumulate(batch["num_images"])]  # [3, 4, 5] -> [0, 3, 7, 12]
+    sections = [
+        sum(lengths[boundaries[i] : boundaries[i + 1]])
+        for i in range(len(batch["num_images"]))
+    ]
+    split_values = list(torch.split(batch["pixel_values"], sections, dim=0))
+    image_grid_thw = list(
+        torch.split(batch["image_grid_thw"], batch["num_images"], dim=0)
+    )
+    return {**batch, "pixel_values": split_values, "image_grid_thw": image_grid_thw}
+
+
+def unsplit_pixel_values_by_grid(
+    batch: dict[str, torch.Tensor | list[torch.Tensor]],
+) -> dict[str, torch.Tensor]:
+    """
+    Opposite of `split_pixel_values_by_grid`. Merges a list of tensors in `batch["pixel_values"]` back into a single
+    tensor along the first dimension.
+    """
+    pixel_values = batch.get("pixel_values")
+    if isinstance(pixel_values, list):
+        merged = torch.cat(pixel_values, dim=0)
+        batch = {**batch, "pixel_values": merged}
+
+    image_grid_thw = batch.get("image_grid_thw")
+    if isinstance(image_grid_thw, list):
+        merged = torch.cat(image_grid_thw, dim=0)
+        batch = {**batch, "image_grid_thw": merged}
+
+    return batch
+
+
+TListOrMapping = TypeVar("TListOrMapping", list, Mapping)
+
+
+def remove_none_values(example: TListOrMapping) -> TListOrMapping:
+    """
+    Recursively removes entries with `None` values from a nested structure (list or dictionary).
+
+    Args:
+        example (`list` or `Mapping`):
+            Input nested structure (list or dictionary) from which to remove `None`.
+
+    Example:
+    ```python
+    >>> [
+    ...     {
+    ...         "a": {"aa": None, "ab": 1},
+    ...         "b": "my_string",
+    ...     }
+    ... ]
+    >>> remove_none_values(example)
+    [{'a': {'ab': 1}, 'b': 'my_string'}]
+    ```
+    """
+    if isinstance(example, list):
+        return [
+            remove_none_values(value) if isinstance(value, (dict, list)) else value
+            for value in example
+        ]
+    if isinstance(example, Mapping):
+        return {
+            key: remove_none_values(value) if isinstance(value, (dict, list)) else value
+            for key, value in example.items()
+            if value is not None
+        }
+    raise TypeError("Input must be a list or a dictionary.")
+
+
+def create_model_from_path(model_id: str, **kwargs) -> PreTrainedModel:
+    """
+    Create a model from a given path using the specified initialization arguments.
+
+    Args:
+        model_id (`str`):
+            Path to the model. Can be either a local directory or a model identifier from the Hugging Face Hub.
+        kwargs (`dict`):
+            Initialization keyword arguments to pass to the model's `from_pretrained` method. When `'dtype'` is
+            specified, it can be either a `torch.dtype` or one of the strings: `'bfloat16'`, `'float16'`, `'float32'`,
+            or `'auto'`.
+
+    Returns
+    -------
+        [`~transformers.PreTrainedModel`]:
+            The instantiated model.
+    """
+    dtype = kwargs.get("dtype")
+    if isinstance(dtype, torch.dtype) or dtype == "auto" or dtype is None:
+        pass  # dtype is already a torch.dtype or "auto" or None
+    elif isinstance(dtype, str) and dtype in ["bfloat16", "float16", "float32"]:
+        kwargs["dtype"] = getattr(torch, dtype)
+    else:
+        raise ValueError(
+            "Invalid `dtype` passed to the config. Expected either 'auto' or a string representing "
+            f"a valid `torch.dtype` (e.g., 'float32'), but got {dtype}."
+        )
+    config = AutoConfig.from_pretrained(model_id)
+    architecture = getattr(transformers, config.architectures[0])
+    model = architecture.from_pretrained(model_id, **kwargs)
+    return model
+
+
+def get_config_model_id(config: PretrainedConfig) -> str:
+    """
+    Retrieve the model identifier from a given model configuration.
+
+    Args:
+        config ([`~transformers.PreTrainedConfig`]):
+            Configuration from which to extract the model identifier.
+
+    Returns
+    -------
+        `str`:
+            The model identifier associated with the model configuration.
+    """
+    # Fall back to `config.text_config._name_or_path` if `config._name_or_path` is missing: Qwen2-VL and Qwen2.5-VL. See GH-4323
+    return getattr(config, "_name_or_path", "") or getattr(
+        getattr(config, "text_config", None), "_name_or_path", ""
+    )
diff --git a/src/aixpert/training/training/trl/trainer/xpo_config.py b/src/aixpert/training/training/trl/trainer/xpo_config.py
new file mode 100644
index 0000000..a05cecb
--- /dev/null
+++ b/src/aixpert/training/training/trl/trainer/xpo_config.py
@@ -0,0 +1,27 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+
+from ..experimental.xpo import XPOConfig as _XPOConfig
+
+
+class XPOConfig(_XPOConfig):
+    def __post_init__(self):
+        warnings.warn(
+            "The `XPOConfig` is now located in `trl.experimental`. Please update your imports to "
+            "`from trl.experimental.xco import XPOConfig`. The current import path will be removed and no longer "
+            "supported in TRL 0.29. For more information, see https://github.com/huggingface/trl/issues/4223."
+        )
+        super().__post_init__()
diff --git a/src/aixpert/training/training/trl/trainer/xpo_trainer.py b/src/aixpert/training/training/trl/trainer/xpo_trainer.py
new file mode 100644
index 0000000..c537f88
--- /dev/null
+++ b/src/aixpert/training/training/trl/trainer/xpo_trainer.py
@@ -0,0 +1,27 @@
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+
+from ..experimental.xpo import XPOTrainer as _XPOTrainer
+
+
+class XPOTrainer(_XPOTrainer):
+    def __init__(self, *args, **kwargs):
+        warnings.warn(
+            "The `XPOTrainer` is now located in `trl.experimental`. Please update your imports to "
+            "`from trl.experimental.xpo import XPOTrainer`. The current import path will be removed and no longer "
+            "supported in TRL 0.29. For more information, see https://github.com/huggingface/trl/issues/4223."
+        )
+        super().__init__(*args, **kwargs)
diff --git a/src/aixpert/training/utils/config_loader.py b/src/aixpert/training/utils/config_loader.py
new file mode 100644
index 0000000..88b3060
--- /dev/null
+++ b/src/aixpert/training/utils/config_loader.py
@@ -0,0 +1,17 @@
+"""Utility module for loading the global YAML configuration file."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any, Dict
+
+import yaml
+
+
+CONFIG_PATH = Path(__file__).resolve().parents[1] / "config" / "config.yaml"
+
+
+def load_config() -> Dict[str, Any]:
+    """Load YAML config into a dictionary."""
+    with open(CONFIG_PATH, "r", encoding="utf-8") as f:
+        return yaml.safe_load(f)
diff --git a/src/aixpert/training/utils/factual_trainer_utils.py b/src/aixpert/training/utils/factual_trainer_utils.py
new file mode 100644
index 0000000..885e157
--- /dev/null
+++ b/src/aixpert/training/utils/factual_trainer_utils.py
@@ -0,0 +1,104 @@
+"""
+Utility functions for preparing datasets and models for DPO and Factual-DPO.
+
+Includes helpers for JSONL loading, Unsloth model setup, and config construction.
+"""
+
+import json
+from typing import Any, Dict, Tuple
+
+import torch
+from datasets import Dataset
+from transformers import PreTrainedTokenizerBase
+from trl import DPOConfig
+from unsloth import FastLanguageModel
+
+
+def load_and_clean_jsonl(path: str) -> Dataset:
+    """Load a JSONL factual dataset and convert it into a HF Dataset."""
+    rows = []
+    with open(path, "r") as f:
+        for line in f:
+            ex = json.loads(line)
+            rows.append(
+                {
+                    "prompt": ex.get("prompt", ""),
+                    "chosen": ex.get("chosen", ""),
+                    "rejected": ex.get("rejected", ""),
+                    "h_w": float(ex.get("h_w", 0)),
+                    "h_l": float(ex.get("h_l", 0)),
+                }
+            )
+    return Dataset.from_list(rows)
+
+
+def load_unsloth_model(model_name: str, max_seq_length: int) -> Tuple[Any, Any]:
+    """Load a 4-bit Unsloth QLoRA model, tokenizer and LoRA adapters applied."""
+    model, tokenizer = FastLanguageModel.from_pretrained(
+        model_name=model_name,
+        max_seq_length=max_seq_length,
+        dtype=None,
+        load_in_4bit=True,
+        device_map=None,
+    )
+
+    model.config.use_flash_attention_2 = True
+
+    tokenizer.pad_token = tokenizer.eos_token
+    tokenizer.model_max_length = max_seq_length
+    tokenizer.padding_side = "right"
+    tokenizer.truncation_side = "left"
+
+    model = FastLanguageModel.get_peft_model(
+        model,
+        r=32,
+        lora_alpha=64,
+        lora_dropout=0.05,
+        target_modules=[
+            "q_proj",
+            "k_proj",
+            "v_proj",
+            "o_proj",
+            "gate_proj",
+            "up_proj",
+            "down_proj",
+        ],
+        use_gradient_checkpointing=True,
+    )
+    return model, tokenizer
+
+
+def build_dpo_config(
+    hp: Dict[str, Any],
+    tokenizer: PreTrainedTokenizerBase,
+    delta: float,
+    output_dir: str,
+) -> DPOConfig:
+    """Build a TRL DPOConfig object with the Safe-DPO factual margin Δ."""
+    cfg = DPOConfig(
+        output_dir=output_dir,
+        beta=hp["beta"],
+        num_train_epochs=hp["num_train_epochs"],
+        per_device_train_batch_size=hp["per_device_train_batch_size"],
+        per_device_eval_batch_size=hp["per_device_train_batch_size"],
+        gradient_accumulation_steps=hp["gradient_accumulation_steps"],
+        learning_rate=hp["learning_rate"],
+        warmup_ratio=hp["warmup_ratio"],
+        lr_scheduler_type="cosine",
+        optim="paged_adamw_8bit",
+        bf16=torch.cuda.is_bf16_supported(),
+        fp16=not torch.cuda.is_bf16_supported(),
+        save_strategy="steps",
+        save_steps=hp["save_steps"],
+        logging_steps=hp["logging_steps"],
+        remove_unused_columns=False,
+        max_length=hp["max_seq_length"],
+        max_prompt_length=hp["max_seq_length"] // 2,
+        padding_value=tokenizer.pad_token_id,
+        ddp_find_unused_parameters=False,
+        report_to=["wandb"],
+        resume_from_checkpoint=True,
+    )
+
+    cfg.delta = delta
+    return cfg
diff --git a/src/aixpert/training/utils/trainer_utils.py b/src/aixpert/training/utils/trainer_utils.py
new file mode 100644
index 0000000..32fb24c
--- /dev/null
+++ b/src/aixpert/training/utils/trainer_utils.py
@@ -0,0 +1,119 @@
+"""
+Utility functions for Original-DPO training.
+
+Includes dataset loading, model setup, LoRA configuration, and construction
+of a TRL DPO trainer.
+"""
+
+import json
+from typing import Any, Tuple
+
+import pandas as pd
+import torch
+from datasets import Dataset
+from trl import DPOConfig, DPOTrainer
+from unsloth import FastLanguageModel
+
+
+def load_dataset_for_dpo(jsonl_path: str) -> Dataset:
+    """Load a JSONL file containing prompt/chosen/rejected triples."""
+    rows = []
+    with open(jsonl_path, "r") as f:
+        for line in f:
+            rows.append(json.loads(line))
+
+    df = pd.DataFrame(rows)
+    ds = Dataset.from_pandas(df, preserve_index=False)
+
+    return ds.map(
+        lambda x: {
+            "prompt": x["prompt"],
+            "chosen": x["chosen"],
+            "rejected": x["rejected"],
+        }
+    )
+
+
+def load_model_and_tokenizer(
+    model_name: str, max_seq_length: int, load_in_4bit: bool = True
+) -> Tuple[Any, Any]:
+    """Load an Unsloth QLoRA model and tokenizer enabled."""
+    model, tokenizer = FastLanguageModel.from_pretrained(
+        model_name=model_name,
+        max_seq_length=max_seq_length,
+        load_in_4bit=load_in_4bit,
+        dtype=None,
+        device_map=None,
+    )
+
+    tokenizer.pad_token = tokenizer.eos_token
+    tokenizer.model_max_length = max_seq_length
+    tokenizer.padding_side = "right"
+    tokenizer.truncation_side = "left"
+
+    model.config.use_flash_attention_2 = True
+
+    return model, tokenizer
+
+
+def apply_lora(model: Any, hp: dict) -> Any:
+    """Apply LoRA adapters to the model using hyperparameters inside config.yaml."""
+    return FastLanguageModel.get_peft_model(
+        model,
+        r=hp["lora_r"],
+        lora_alpha=hp["lora_alpha"],
+        lora_dropout=hp["lora_dropout"],
+        use_gradient_checkpointing=True,
+        bias="none",
+        target_modules=[
+            "q_proj",
+            "k_proj",
+            "v_proj",
+            "o_proj",
+            "gate_proj",
+            "up_proj",
+            "down_proj",
+        ],
+    )
+
+
+def build_dpo_trainer(
+    model: Any,
+    tokenizer: Any,
+    train_data: Dataset,
+    eval_data: Dataset,
+    cfg: dict,
+    output_dir: str,
+) -> DPOTrainer:
+    """Build and return a TRL DPOTrainer for Original-DPO."""
+    training_args = DPOConfig(
+        output_dir=output_dir,
+        beta=cfg["beta"],
+        num_train_epochs=cfg["num_epochs"],
+        per_device_train_batch_size=cfg["batch_size"],
+        per_device_eval_batch_size=cfg["batch_size"],
+        gradient_accumulation_steps=cfg["grad_accumulation"],
+        learning_rate=cfg["learning_rate"],
+        warmup_ratio=cfg["warmup_ratio"],
+        lr_scheduler_type="cosine",
+        optim="paged_adamw_8bit",
+        save_steps=cfg["save_steps"],
+        logging_steps=cfg["logging_steps"],
+        seed=cfg["seed"],
+        remove_unused_columns=False,
+        max_length=cfg["max_seq_length"],
+        max_prompt_length=cfg["max_seq_length"] // 2,
+        padding_value=tokenizer.pad_token_id,
+        report_to="none",
+        bf16=torch.cuda.is_bf16_supported(),
+        fp16=not torch.cuda.is_bf16_supported(),
+    )
+
+    return DPOTrainer(
+        model=model,
+        ref_model=None,
+        args=training_args,
+        train_dataset=train_data,
+        eval_dataset=eval_data,
+        processing_class=tokenizer,
+    )