3 changes: 3 additions & 0 deletions .github/workflows/code_checks.yml
@@ -24,6 +24,9 @@ on:
      - uv.lock
      - pyproject.toml
      - '**.ipynb'
    paths-ignore:
      - "trl/**"
      - "**/factualdpo_trainer.py"

jobs:
  run-code-check:
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
@@ -49,6 +49,7 @@ repos:
      - id: typos
        args: []


  - repo: https://github.com/nbQA-dev/nbQA
    rev: 1.9.1
    hooks:
5 changes: 4 additions & 1 deletion pyproject.toml
@@ -120,7 +120,10 @@ mypy_path = "src"
# -----------------------------------------------------
[tool.ruff]
include = ["*.py", "pyproject.toml", "*.ipynb"]
exclude = []
exclude = [
"trl",
"**/factualdpo_trainer.py",
]
line-length = 88

[tool.ruff.format]
10 changes: 8 additions & 2 deletions src/aixpert/data_construction/config/config.yaml
@@ -1,5 +1,3 @@
repository: /projects/aixpert/users/sindhu/Loss_Test

model:
  name: gpt-4o-mini # or gpt-4o
  temperature: 0.8
@@ -36,6 +34,9 @@ paths:
  train_flipped_out: "src/aixpert/data_construction/data/train_balanced_flipped.jsonl"
  eval_flipped_out: "src/aixpert/data_construction/data/eval_final_flipped.jsonl"

  final_train: "src/aixpert/data_construction/data/train_final_processed.jsonl"
  final_eval: "src/aixpert/data_construction/data/eval_final_processed.jsonl"
skywork_file: "Skywork/Skywork-Reward-Preference-80K-v0.1"
@@ -59,3 +60,8 @@ hyperparams:
"(1,1)": 10000

eval_additional_clean_samples: 1500

dataset_processing:
  keep_keys: ["prompt", "chosen", "rejected", "h_w", "h_l", "flipped"]

openai_api_key: "${OPENAI_API_KEY}"
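
The `${OPENAI_API_KEY}` placeholder implies the config loader expands environment variables at load time. A minimal sketch of such a loader, assuming PyYAML and the config path used by this package (the real `utils/config_loader.py` is not part of this diff):

```python
import os
from pathlib import Path
from typing import Any, Dict

import yaml

# Assumed location; adjust to wherever the package resolves its config.
CONFIG_PATH = Path("src/aixpert/data_construction/config/config.yaml")


def load_config(path: Path = CONFIG_PATH) -> Dict[str, Any]:
    """Read the YAML config, expanding ${VAR} references from the environment."""
    text = os.path.expandvars(path.read_text())  # e.g. ${OPENAI_API_KEY}
    return yaml.safe_load(text)
```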
24 changes: 24 additions & 0 deletions src/aixpert/data_construction/stage_9_last/data_train.py
@@ -0,0 +1,24 @@
"""This script performs the final cleanup step for the train dataset (flipping + key filtering)."""

from __future__ import annotations

from pathlib import Path

from utils.config_loader import load_config
from utils.data_utils import process_jsonl_with_flip


def main() -> None:
    """Load paths and run the final processing stage for the train dataset."""
    paths = load_config()["paths"]

    input_path = Path(paths["train_flipped_out"])
    output_path = Path(paths["final_train"])

    process_jsonl_with_flip(input_path=input_path, output_path=output_path)

    print("Train dataset final processing completed.")


if __name__ == "__main__":
    main()
24 changes: 24 additions & 0 deletions src/aixpert/data_construction/stage_9_last/data_val.py
@@ -0,0 +1,24 @@
"""This script performs the final cleanup step for the train dataset (flipping + key filtering)."""

from __future__ import annotations

from pathlib import Path

from utils.config_loader import load_config
from utils.data_utils import process_jsonl_with_flip


def main() -> None:
    """Load paths and run the final processing stage for the val dataset."""
    paths = load_config()["paths"]

    input_path = Path(paths["eval_flipped_out"])
    output_path = Path(paths["final_eval"])

    process_jsonl_with_flip(input_path=input_path, output_path=output_path)

    print("Val dataset final processing completed.")


if __name__ == "__main__":
    main()
27 changes: 27 additions & 0 deletions src/aixpert/data_construction/utils/data_utils.py
Expand Up @@ -11,6 +11,8 @@
from pathlib import Path
from typing import Any, Dict, List, Tuple

from utils.config_loader import load_config


def extract_prompt(dialog: List[Dict[str, Any]]) -> str:
    """Extract the first user message."""
@@ -121,3 +123,28 @@ def flip_sample(item: Dict[str, Any]) -> Dict[str, Any]:
item["h_w"], item["h_l"] = 0, 1
item["chosen"], item["rejected"] = item["rejected"], item["chosen"]
return item


def process_jsonl_with_flip(input_path: Path, output_path: Path) -> None:
    """Filter each record down to the configured keep_keys and normalize the `flipped` flag."""
    cfg = load_config()
    keep_keys = cfg["dataset_processing"]["keep_keys"]

    with open(input_path, "r") as f_in, open(output_path, "w") as f_out:
        for line in f_in:
            if not line.strip():
                continue

            data = json.loads(line)

            # Synthetic inversions are always flipped samples.
            if data.get("source") == "synthetic_inversion":
                data["flipped"] = True

            # Keep only the configured keys; missing keys default to None.
            cleaned = {k: data.get(k) for k in keep_keys}

            if cleaned.get("flipped") is None:
                cleaned["flipped"] = False

            f_out.write(json.dumps(cleaned) + "\n")

    print(f"[✓] Saved processed file to: {output_path}")
172 changes: 172 additions & 0 deletions src/aixpert/evaluation/README_.md
@@ -0,0 +1,172 @@
# AIXpert Preference Alignment — Evaluation Pipeline
GPT-4o-mini Judge · Factuality Scoring · Multi-Model Benchmarking

This directory implements the **automated evaluation pipeline** used to benchmark:

- Original-DPO models
- Factual-DPO models (across Δ = 0, 2, 4, 6, 8, 10, 20, 30, 50, 100)

Evaluation is performed using **GPT-4o-mini** as an LLM-as-a-judge.

All evaluation configuration is pulled from:

```
src/aixpert/evaluation/config/config.yaml
```

---

## 📁 Evaluation Directory Structure

```
src/aixpert/evaluation/
├── evaluations/
│ └── run_all_evaluations.py # Main orchestrator
├── utils/
│ ├── eval_core_utils.py # Generation + GPT judge scoring
│ └── eval_template.py # Factual judge prompt
```

---

# ⚙️ Configuration Overview (Evaluation)

The configuration is organized into the following blocks:

---

## 1️⃣ Evaluation Settings

```yaml
eval:
  data_file: "src/aixpert/data_construction/data/skywork_extracted_test.jsonl"
  batch_size: 16
  max_new_tokens: 2048
  judge_concurrency: 10
```

---

## 2️⃣ Model Paths

```yaml
paths:
  original_root: "src/aixpert/training/data/original/Models"
  factual_root: "src/aixpert/training/data/factual/Models"
```

The evaluation script automatically locates checkpoints:

```
<original_root>/<short>_OriginalDPO/
<factual_root>/<short>_delta<value>/
```

---

## 3️⃣ Model Registry & Δ Values

```yaml
models:
- short: "gemma2-9b"
- short: "llama3-8b"
- short: "qwen3-8b"
```

```yaml
deltas: [0, 2, 4, 6, 8, 10, 20, 30, 50, 100]
```

Total evaluations:

```
7 models × 10 deltas = 70 comparisons
```

---

## 🧠 Factual Judge Model

```yaml
llm-as-judge:
  name: "gpt-4o-mini"  # or gpt-4o
  temperature: 0.8
```

---

# 📊 Evaluation Metrics

For each model pair, the pipeline computes:

| Metric | Meaning |
|--------|---------|
| factuality_A | Mean factual score of Original-DPO model |
| factuality_B | Mean factual score of Δ-model |
| halluc_rate_A | % outputs scoring < 5 |
| halluc_rate_B | % outputs scoring < 5 |
| win_rate | How often Δ-model outperforms baseline |
| count | Total prompts evaluated |
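
A minimal sketch of how these aggregates can be computed from per-prompt judge scores (`summarize` is a hypothetical name; hallucination and win rates are expressed as percentages):

```python
from typing import Dict, List


def summarize(scores_a: List[float], scores_b: List[float]) -> Dict[str, float]:
    """Aggregate per-prompt factuality scores into the metrics above."""
    n = len(scores_a)
    return {
        "factuality_A": sum(scores_a) / n,
        "factuality_B": sum(scores_b) / n,
        "halluc_rate_A": 100 * sum(s < 5 for s in scores_a) / n,
        "halluc_rate_B": 100 * sum(s < 5 for s in scores_b) / n,
        "win_rate": 100 * sum(b > a for a, b in zip(scores_a, scores_b)) / n,
        "count": n,
    }
```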

Results saved to:

```
eval_results.json
```

---

# 🚀 Running Evaluation

```bash
python -m aixpert.evaluation.evaluations.run_all_evaluations
```

The script:

1. Loads config
2. Loads evaluation prompts
3. Loads Original-DPO and Δ-models
4. Generates responses
5. Sends to GPT-4o-mini asynchronously
6. Computes metrics
7. Saves results
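
Condensed, the loop looks roughly like this (a sketch only: `load_config` and the import paths are assumptions, and `evaluate_pair` is described below):

```python
import json

from utils.config_loader import load_config  # assumed helper
from utils.eval_core_utils import evaluate_pair  # see Core Components


def main() -> None:
    cfg = load_config()
    results = {}
    for model in cfg["models"]:
        for delta in cfg["deltas"]:
            key = f'{model["short"]}_delta{delta}'
            # Generate with both checkpoints, judge, and aggregate metrics.
            results[key] = evaluate_pair(model["short"], delta, cfg)
    with open("eval_results.json", "w") as f:
        json.dump(results, f, indent=2)
```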

---

# 🧩 Core Components

## `eval_core_utils.py`

Includes:

- **batch_generate()** → Deterministic HF inference
- **judge_factual()** → Scores one answer
- **judge_many()** → Async batch scoring
- **evaluate_pair()** → Full evaluation for one (model, Δ)
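
As an example of the concurrency pattern, `judge_many()` can bound in-flight judge calls with a semaphore sized by `judge_concurrency` (signatures here are assumptions, not the exact API):

```python
import asyncio
from typing import Awaitable, Callable, List


async def judge_many(
    answers: List[str],
    judge_one: Callable[[str], Awaitable[float]],  # e.g. judge_factual
    concurrency: int = 10,
) -> List[float]:
    """Score all answers concurrently, with at most `concurrency` in flight."""
    sem = asyncio.Semaphore(concurrency)

    async def bounded(answer: str) -> float:
        async with sem:
            return await judge_one(answer)

    return list(await asyncio.gather(*(bounded(a) for a in answers)))
```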

---

## `eval_template.py`

Provides the factuality judge prompt, which instructs GPT-4o-mini to wrap its numeric verdict in the `[[score]]` format.
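
Replies in that format can be recovered with a small regex; a hypothetical sketch (the real parsing lives in the evaluation utils):

```python
import re
from typing import Optional

SCORE_RE = re.compile(r"\[\[(\d+(?:\.\d+)?)\]\]")


def parse_score(judge_reply: str) -> Optional[float]:
    """Extract the numeric verdict from a [[score]]-formatted judge reply."""
    match = SCORE_RE.search(judge_reply)
    return float(match.group(1)) if match else None


assert parse_score("Well supported by sources. Verdict: [[8]]") == 8.0
```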

---

# ✅ Summary

This evaluation pipeline provides:

- End-to-end factuality benchmarking
- Async OpenAI judge scoring
- Multi-model × multi-delta evaluation
- Config-driven reproducibility
- Clean JSON output for papers and analysis
26 changes: 26 additions & 0 deletions src/aixpert/evaluation/config/config.yaml
@@ -0,0 +1,26 @@
eval:
  data_file: "src/aixpert/data_construction/data/skywork_extracted_test.jsonl"
  batch_size: 16
  max_new_tokens: 2048
  judge_concurrency: 10

paths:
  original_root: "src/aixpert/training/data/original/Models"
  factual_root: "src/aixpert/training/data/factual/Models"

models:
  - short: "gemma2-9b"
  - short: "qwen2.5-14b"
  - short: "llama3.2-1b"
  - short: "gemma2-2b"
  - short: "llama3-8b"
  - short: "qwen2-7b"
  - short: "qwen3-8b"

deltas: [0, 2, 4, 6, 8, 10, 20, 30, 50, 100]

llm-as-judge:
  name: gpt-4o-mini # or gpt-4o
  temperature: 0.8

openai_api_key: "${OPENAI_API_KEY}"