Commit c9024be: Merge pull request #7 from VectorInstitute/training
Training and Evaluation
2 parents: 4336cc2 + ff1bfa4

124 files changed: +47120, -3 lines

.github/workflows/code_checks.yml

Lines changed: 3 additions & 0 deletions

```diff
@@ -24,6 +24,9 @@ on:
       - uv.lock
       - pyproject.toml
       - '**.ipynb'
+    paths-ignore:
+      - "trl/**"
+      - "**/factualdpo_trainer.py"

 jobs:
   run-code-check:
```

.pre-commit-config.yaml

Lines changed: 1 addition & 0 deletions

```diff
@@ -49,6 +49,7 @@ repos:
       - id: typos
         args: []

+
   - repo: https://github.com/nbQA-dev/nbQA
     rev: 1.9.1
     hooks:
```

pyproject.toml

Lines changed: 4 additions & 1 deletion

```diff
@@ -120,7 +120,10 @@ mypy_path = "src"
 # -----------------------------------------------------
 [tool.ruff]
 include = ["*.py", "pyproject.toml", "*.ipynb"]
-exclude = []
+exclude = [
+    "trl",
+    "**/factualdpo_trainer.py",
+]
 line-length = 88

 [tool.ruff.format]
```

src/aixpert/data_construction/config/config.yaml

Lines changed: 8 additions & 2 deletions

```diff
@@ -1,5 +1,3 @@
-repository: /projects/aixpert/users/sindhu/Loss_Test
-
 model:
   name: gpt-4o-mini # or gpt-4o
   temperature: 0.8
@@ -36,6 +34,9 @@ paths:
   train_flipped_out: "src/aixpert/data_construction/data/train_balanced_flipped.jsonl"
   eval_flipped_out: "src/aixpert/data_construction/data/eval_final_flipped.jsonl"

+  final_train: "src/aixpert/data_construction/data/train_final_processed.jsonl"
+  final_eval: "src/aixpert/data_construction/data/eval_final_processed.jsonl"
+


  skywork_file: "Skywork/Skywork-Reward-Preference-80K-v0.1"
@@ -59,3 +60,8 @@ hyperparams:
   "(1,1)": 10000

  eval_additional_clean_samples: 1500
+
+dataset_processing:
+  keep_keys: ["prompt", "chosen", "rejected", "h_w", "h_l", "flipped"]
+
+openai_api_key: "${OPENAI_API_KEY}"
```
Lines changed: 24 additions & 0 deletions

```python
"""This script performs the final cleanup step for the train dataset (flipping + key filtering)."""

from __future__ import annotations

from pathlib import Path

from utils.config_loader import load_config
from utils.data_utils import process_jsonl_with_flip


def main():
    """Load paths and run the final processing stage for the train dataset."""
    paths = load_config()["paths"]

    input_path = Path(paths["train_flipped_out"])
    output_path = Path(paths["final_train"])

    process_jsonl_with_flip(input_path=input_path, output_path=output_path)

    print("Train dataset final processing completed.")


if __name__ == "__main__":
    main()
```
Lines changed: 24 additions & 0 deletions

```python
"""This script performs the final cleanup step for the eval dataset (flipping + key filtering)."""

from __future__ import annotations

from pathlib import Path

from utils.config_loader import load_config
from utils.data_utils import process_jsonl_with_flip


def main():
    """Load paths and run the final processing stage for the eval dataset."""
    paths = load_config()["paths"]

    input_path = Path(paths["eval_flipped_out"])
    output_path = Path(paths["final_eval"])

    process_jsonl_with_flip(input_path=input_path, output_path=output_path)

    print("Eval dataset final processing completed.")


if __name__ == "__main__":
    main()
```

src/aixpert/data_construction/utils/data_utils.py

Lines changed: 27 additions & 0 deletions

```diff
@@ -11,6 +11,8 @@
 from pathlib import Path
 from typing import Any, Dict, List, Tuple

+from utils.config_loader import load_config
+

 def extract_prompt(dialog: List[Dict[str, Any]]) -> str:
     """Extract the first user message."""
@@ -121,3 +123,28 @@ def flip_sample(item: Dict[str, Any]) -> Dict[str, Any]:
     item["h_w"], item["h_l"] = 0, 1
     item["chosen"], item["rejected"] = item["rejected"], item["chosen"]
     return item
+
+
+def process_jsonl_with_flip(input_path: str, output_path: str):
+    """Filter each JSONL record down to the configured keep_keys and set its flipped flag."""
+    cfg = load_config()
+    keep_keys = cfg.dataset_processing.keep_keys
+
+    with open(input_path, "r") as f_in, open(output_path, "w") as f_out:
+        for line in f_in:
+            if not line.strip():
+                continue
+
+            data = json.loads(line)
+
+            if data.get("source") == "synthetic_inversion":
+                data["flipped"] = True
+
+            cleaned = {k: data.get(k) for k in keep_keys}
+
+            if cleaned.get("flipped") is None:
+                cleaned["flipped"] = False
+
+            f_out.write(json.dumps(cleaned) + "\n")
+
+    print(f"[✓] Saved processed file to: {output_path}")
```

src/aixpert/evaluation/README_.md

Lines changed: 172 additions & 0 deletions

# AIXpert Preference Alignment — Evaluation Pipeline

GPT-4o-mini Judge · Factuality Scoring · Multi-Model Benchmarking

This directory implements the **automated evaluation pipeline** used to benchmark:

- Original-DPO models
- Factual-DPO models (across Δ = 0, 2, 4, 6, 8, 10, 20, 30, 50, 100)

Evaluation is performed using **GPT-4o-mini** as an LLM-as-a-judge.

All evaluation configuration is pulled from:

```
src/aixpert/config/config.yaml
```

---

## 📁 Evaluation Directory Structure

```
src/aixpert/evaluation/
├── evaluations/
│   └── run_all_evaluations.py   # Main orchestrator
├── utils/
│   ├── eval_core_utils.py       # Generation + GPT judge scoring
│   └── eval_template.py         # Factual judge prompt
```
---

# ⚙️ Configuration Overview (Evaluation)

The configuration includes:

---

## 1️⃣ Evaluation Settings

```yaml
eval:
  data_file: "src/aixpert/data_construction/data/skywork_extracted_test.jsonl"
  batch_size: 16
  max_new_tokens: 2048
  judge_concurrency: 10
```

---

## 2️⃣ Model Paths

```yaml
paths:
  original_root: "src/aixpert/training/data/original/Models"
  factual_root: "src/aixpert/training/data/factual/Models"
```

The evaluation script automatically locates checkpoints named:

```
<short>_OriginalDPO/
<short>_delta<value>/
```
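For illustration, checkpoint paths could be resolved from the configured roots like this. This is a hypothetical helper, not the actual discovery logic inside `run_all_evaluations.py`:

```python
from __future__ import annotations

from pathlib import Path


def resolve_checkpoints(
    original_root: str, factual_root: str, short: str, delta: int
) -> tuple[Path, Path]:
    """Build the expected checkpoint directories for one (model, delta) pair."""
    baseline = Path(original_root) / f"{short}_OriginalDPO"
    delta_model = Path(factual_root) / f"{short}_delta{delta}"
    return baseline, delta_model


# e.g. resolve_checkpoints(paths["original_root"], paths["factual_root"],
#                          "llama3-8b", 10)
```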
---

## 3️⃣ Model Registry & Δ Values

```yaml
models:
  - short: "gemma2-9b"
  - short: "qwen2.5-14b"
  - short: "llama3.2-1b"
  - short: "gemma2-2b"
  - short: "llama3-8b"
  - short: "qwen2-7b"
  - short: "qwen3-8b"
```

```yaml
deltas: [0, 2, 4, 6, 8, 10, 20, 30, 50, 100]
```

Total evaluations:

```
7 models × 10 deltas = 70 comparisons
```
---

## 🧠 Factual Judge Model

```yaml
llm-as-judge:
  name: "gpt-4o-mini"
  temperature: 0.8
```
---

# 📊 Evaluation Metrics

For each model pair, the pipeline computes:

| Metric | Meaning |
|--------|---------|
| factuality_A | Mean factual score of the Original-DPO model |
| factuality_B | Mean factual score of the Δ-model |
| halluc_rate_A | % of Original-DPO outputs scoring < 5 |
| halluc_rate_B | % of Δ-model outputs scoring < 5 |
| win_rate | How often the Δ-model outperforms the baseline |
| count | Total prompts evaluated |

Results are saved to:

```
eval_results.json
```
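As a rough sketch, the metrics in the table correspond to the following aggregation over per-prompt judge scores. This is a hypothetical helper; in particular, the strict-inequality tie handling in `win_rate` is an assumption:

```python
from __future__ import annotations


def aggregate_scores(scores_a: list[float], scores_b: list[float]) -> dict:
    """Aggregate per-prompt factuality scores for the baseline (A) and Δ-model (B)."""
    n = len(scores_a)
    return {
        "factuality_A": sum(scores_a) / n,
        "factuality_B": sum(scores_b) / n,
        # hallucination rate: fraction of outputs the judge scored below 5
        "halluc_rate_A": sum(s < 5 for s in scores_a) / n,
        "halluc_rate_B": sum(s < 5 for s in scores_b) / n,
        # win rate: how often the Δ-model strictly beats the baseline
        "win_rate": sum(b > a for a, b in zip(scores_a, scores_b)) / n,
        "count": n,
    }
```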
---

# 🚀 Running Evaluation

```bash
python -m aixpert.evaluation.evaluations.run_all_evaluations
```
The script (a rough sketch follows the list):

1. Loads config
2. Loads evaluation prompts
3. Loads Original-DPO and Δ-models
4. Generates responses
5. Sends to GPT-4o-mini asynchronously
6. Computes metrics
7. Saves results
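A minimal orchestration sketch of these steps; `evaluate_pair`'s real signature and import path in `eval_core_utils.py` may differ from what is assumed here:

```python
import json

# import path assumed from the directory structure above
from utils.eval_core_utils import evaluate_pair


def run_all(cfg: dict) -> None:
    """Evaluate every (model, delta) pair and write the metrics to disk."""
    results = {}
    for model in cfg["models"]:
        for delta in cfg["deltas"]:
            key = f"{model['short']}_delta{delta}"
            # evaluate_pair() generates responses from both checkpoints,
            # has GPT-4o-mini judge them, and returns the metrics dict
            results[key] = evaluate_pair(model["short"], delta, cfg)

    with open("eval_results.json", "w") as f:
        json.dump(results, f, indent=2)
```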
---

# 🧩 Core Components

## `eval_core_utils.py`

Includes:

- **batch_generate()** → Deterministic HF inference
- **judge_factual()** → Scores one answer
- **judge_many()** → Async batch scoring (sketched below)
- **evaluate_pair()** → Full evaluation for one (model, Δ) pair
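A minimal sketch of the async batch-scoring pattern behind `judge_many()`. This is an assumed implementation, not the repository's code; the concurrency cap corresponds to `eval.judge_concurrency`, and `judge_factual` is assumed to be an awaitable defined in the same module:

```python
from __future__ import annotations

import asyncio


async def judge_many(
    prompts: list[str], answers: list[str], concurrency: int = 10
) -> list[float]:
    """Score many (prompt, answer) pairs concurrently, capped by a semaphore."""
    sem = asyncio.Semaphore(concurrency)

    async def bounded(prompt: str, answer: str) -> float:
        async with sem:
            # judge_factual() sends one judging request to GPT-4o-mini
            # and returns a numeric factuality score
            return await judge_factual(prompt, answer)

    return await asyncio.gather(*(bounded(p, a) for p, a in zip(prompts, answers)))
```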
---

## `eval_template.py`

Provides the factuality judge prompt, which instructs the judge to return its rating in the `[[score]]` format.
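For example, that tag can be pulled out of the judge's reply with a small regex. A sketch, assuming the judge emits a tag like `[[7]]`:

```python
from __future__ import annotations

import re


def parse_score(reply: str) -> float | None:
    """Extract the numeric rating from a judge reply containing [[score]]."""
    match = re.search(r"\[\[(\d+(?:\.\d+)?)\]\]", reply)
    return float(match.group(1)) if match else None


# parse_score("Mostly accurate, minor omissions. Rating: [[7]]") -> 7.0
```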
---

# ✅ Summary

This evaluation pipeline provides:

- End-to-end factuality benchmarking
- Async OpenAI judge scoring
- Multi-model × multi-delta evaluation
- Config-driven reproducibility
- Clean JSON output for papers and analysis
Lines changed: 26 additions & 0 deletions

```yaml
eval:
  data_file: "src/aixpert/data_construction/data/skywork_extracted_test.jsonl"
  batch_size: 16
  max_new_tokens: 2048
  judge_concurrency: 10

paths:
  original_root: "src/aixpert/training/data/original/Models"
  factual_root: "src/aixpert/training/data/factual/Models"

models:
  - short: "gemma2-9b"
  - short: "qwen2.5-14b"
  - short: "llama3.2-1b"
  - short: "gemma2-2b"
  - short: "llama3-8b"
  - short: "qwen2-7b"
  - short: "qwen3-8b"

deltas: [0, 2, 4, 6, 8, 10, 20, 30, 50, 100]

llm-as-judge:
  name: gpt-4o-mini # or gpt-4o
  temperature: 0.8

openai_api_key: "${OPENAI_API_KEY}"
```
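The `"${OPENAI_API_KEY}"` value suggests environment-variable expansion at config-load time. A minimal sketch of that pattern, which is an assumption about `load_config`'s behavior rather than the repository's actual implementation:

```python
import os

import yaml


def load_config(path: str = "config.yaml") -> dict:
    """Load a YAML config, expanding ${VAR} references from the environment."""
    with open(path) as f:
        raw = f.read()
    # os.path.expandvars replaces ${OPENAI_API_KEY} with the value exported
    # in the shell, so the secret never has to live in the file itself
    return yaml.safe_load(os.path.expandvars(raw))
```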
