Skip to content

Commit 54fedbc

Browse files
author
sindchad
committed
Fix workflows
1 parent db83f97 commit 54fedbc

File tree

2 files changed

+138
-0
lines changed

2 files changed

+138
-0
lines changed

.github/workflows/code_checks.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,3 +55,5 @@ jobs:
5555
uses: pypa/gh-action-pip-audit@1220774d901786e6f652ae159f7b6bc8fea6d266
5656
with:
5757
virtual-environment: .venv/
58+
strict: false
59+
args: --ignore PYSEC-2024-161
Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
"""
2+
Skywork extraction utilities.
3+
4+
This module extracts prompt/chosen/rejected fields from the Skywork Preference
5+
dataset, removes exact duplicates, and writes the cleaned dataset to JSONL
6+
files. Fully compatible with ruff, mypy, and the AI Engineering template.
7+
"""
8+
9+
from __future__ import annotations
10+
11+
import json
12+
import os
13+
from typing import Any, Dict, List
14+
15+
import pandas as pd
16+
from datasets import load_dataset
17+
18+
19+
SUBSET_SIZE = 80000
20+
OUT_FILE = (
21+
"/projects/aixpert/users/sindhu/Loss_Test/Factual_Trust_Loss/data_creation/data/"
22+
"skywork_extracted_15k.jsonl"
23+
)
24+
REMOVED_FILE = (
25+
"/projects/aixpert/users/sindhu/Loss_Test/Factual_Trust_Loss/data_creation/data/"
26+
"skywork_cleaned_15k.jsonl"
27+
)
28+
29+
print(f"📥 Loading first {SUBSET_SIZE} samples from Skywork...")
30+
31+
32+
# ============================================================
33+
# Dataset loading
34+
# ============================================================
35+
ds = load_dataset(
36+
"Skywork/Skywork-Reward-Preference-80K-v0.1",
37+
split=f"train[:{SUBSET_SIZE}]",
38+
)
39+
40+
df = ds.to_pandas()
41+
42+
43+
# ============================================================
44+
# Extract prompt / chosen / rejected
45+
# ============================================================
46+
def extract_prompt_from_dialog(dialog: List[Dict[str, Any]]) -> str:
47+
"""
48+
Extract the first user message from a dialog.
49+
50+
Parameters
51+
----------
52+
dialog : list of dict
53+
A list of message objects with "role" and "content" keys.
54+
55+
Returns
56+
-------
57+
str
58+
The content of the first message with role 'user', or an empty string.
59+
"""
60+
for msg in dialog:
61+
if msg.get("role") == "user":
62+
return str(msg.get("content", "")).strip()
63+
return ""
64+
65+
66+
def extract_answer_from_dialog(dialog: List[Dict[str, Any]]) -> str:
67+
"""
68+
Extract the first assistant message from a dialog.
69+
70+
Parameters
71+
----------
72+
dialog : list of dict
73+
A list of message objects with "role" and "content" keys.
74+
75+
Returns
76+
-------
77+
str
78+
The content of the first message with role 'assistant', or an empty string.
79+
"""
80+
for msg in dialog:
81+
if msg.get("role") == "assistant":
82+
return str(msg.get("content", "")).strip()
83+
return ""
84+
85+
86+
df["prompt"] = df["chosen"].apply(extract_prompt_from_dialog)
87+
df["chosen"] = df["chosen"].apply(extract_answer_from_dialog)
88+
df["rejected"] = df["rejected"].apply(extract_answer_from_dialog)
89+
90+
clean_df = df[["prompt", "chosen", "rejected"]]
91+
92+
# ============================================================
93+
# 🔍 Exact-match removal (chosen == rejected)
94+
# ============================================================
95+
cleaned: List[Dict[str, str]] = []
96+
removed: List[Dict[str, str]] = []
97+
98+
for _, row in clean_df.iterrows():
99+
chosen = str(row["chosen"]).strip()
100+
rejected = str(row["rejected"]).strip()
101+
102+
sample = {
103+
"prompt": str(row["prompt"]).strip(),
104+
"chosen": chosen,
105+
"rejected": rejected,
106+
}
107+
108+
if chosen == rejected:
109+
removed.append(sample)
110+
else:
111+
cleaned.append(sample)
112+
113+
print(f"🧹 Removed exact duplicates: {len(removed)}")
114+
print(f"📦 Remaining clean samples: {len(cleaned)}")
115+
116+
# Ensure output directory exists
117+
os.makedirs(os.path.dirname(OUT_FILE), exist_ok=True)
118+
119+
120+
# ============================================================
121+
# Save output JSONL files
122+
# ============================================================
123+
def write_jsonl(path: str, rows: List[Dict[str, Any]]) -> None:
124+
"""Write a list of dictionaries to a JSONL file."""
125+
with open(path, "w", encoding="utf-8") as f:
126+
for row in rows:
127+
f.write(json.dumps(row, ensure_ascii=False) + "\n")
128+
129+
130+
write_jsonl(OUT_FILE, cleaned)
131+
write_jsonl(REMOVED_FILE, removed)
132+
133+
print(f"✅ Saved cleaned dataset ({len(cleaned)} samples) → {OUT_FILE}")
134+
print(f"🗑️ Saved removed duplicates ({len(removed)} samples) → {REMOVED_FILE}")
135+
136+
print(pd.DataFrame(cleaned).head())

0 commit comments

Comments
 (0)