-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgenerate_training_data.py
More file actions
187 lines (142 loc) · 5.69 KB
/
generate_training_data.py
File metadata and controls
187 lines (142 loc) · 5.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
import json
import os
import random
import time
from openai import OpenAI

# OpenRouter exposes an OpenAI-compatible endpoint, so the standard OpenAI
# client works with only a base_url override.  The API key is read from the
# OPENROUTER_API_KEY environment variable (KeyError at import time if unset).
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.environ["OPENROUTER_API_KEY"],
)

# Model used for every generation request below.
MODEL = "openai/gpt-5.4-mini"  # Or any model on OpenRouter
def load_faqs(path="faqs.json"):
    """Load FAQ entries from a JSON file.

    Args:
        path: Path to a JSON file. Downstream code expects a list of dicts,
            each with 'id', 'topic', 'question' and 'answer' keys.

    Returns:
        The parsed JSON content.
    """
    # Explicit encoding: JSON is UTF-8 by spec, while the platform default
    # encoding (used when none is given) is not guaranteed to be UTF-8.
    with open(path, encoding="utf-8") as f:
        return json.load(f)
def generate_variants(faq, num_variants=10):
    """Generate diverse question variants for a single FAQ entry.

    Args:
        faq: Dict with 'topic', 'question' and 'answer' keys.
        num_variants: Number of question phrasings to request.

    Returns:
        List of question strings parsed from the model's JSON reply.

    Raises:
        json.JSONDecodeError: If the model reply is not valid JSON.
        ValueError: If the reply parses but is not a JSON array.
    """
    prompt = f"""You are generating synthetic training data for a customer support chatbot
for TAIKAI, a hackathon and open innovation platform.
Given this FAQ entry:
Topic: {faq['topic']}
Original Question: {faq['question']}
Answer: {faq['answer']}
Generate {num_variants} diverse, realistic ways a real user might ask this question.
Include variety in:
- Formality (casual to professional)
- Specificity (vague to detailed)
- Emotional tone (frustrated, confused, curious, urgent)
- Phrasing (questions, statements, complaints)
- Typos and informal language (some, not all)
Return ONLY a JSON array of strings, no other text. Example format:
["question 1", "question 2", "question 3"]"""
    response = client.chat.completions.create(
        model=MODEL,
        max_tokens=2000,
        messages=[{"role": "user", "content": prompt}],
    )
    raw = response.choices[0].message.content.strip()
    # Strip only a *surrounding* markdown code fence.  The previous blanket
    # str.replace() also mangled any backticks appearing inside the
    # generated questions themselves.
    if raw.startswith("```json"):
        raw = raw[len("```json"):]
    elif raw.startswith("```"):
        raw = raw[3:]
    raw = raw.strip()
    if raw.endswith("```"):
        raw = raw[:-3].strip()
    variants = json.loads(raw)
    # Guard against a model reply that is valid JSON but not the requested
    # array shape (e.g. a JSON object wrapping the list).
    if not isinstance(variants, list):
        raise ValueError(f"Expected a JSON array, got {type(variants).__name__}")
    return variants
def generate_answer_variants(faq, num_variants=3):
    """Generate slightly different answer phrasings to avoid overfitting.

    Args:
        faq: Dict with 'question' and 'answer' keys.
        num_variants: Number of answer phrasings to request.

    Returns:
        List of answer strings parsed from the model's JSON reply.

    Raises:
        json.JSONDecodeError: If the model reply is not valid JSON.
        ValueError: If the reply parses but is not a JSON array.
    """
    prompt = f"""You are writing answers for a customer support chatbot for TAIKAI,
a hackathon and open innovation platform.
Given this FAQ:
Question: {faq['question']}
Official Answer: {faq['answer']}
Generate {num_variants} different answer phrasings that:
- Contain the same factual information
- Vary in length (concise, medium, detailed)
- Sound natural and helpful
- Use slightly different wording each time
- Always remain accurate to the official answer
Return ONLY a JSON array of strings. Example format:
["answer 1", "answer 2", "answer 3"]"""
    response = client.chat.completions.create(
        model=MODEL,
        max_tokens=3000,
        messages=[{"role": "user", "content": prompt}],
    )
    raw = response.choices[0].message.content.strip()
    # Strip only a *surrounding* markdown code fence; a blanket replace of
    # "```" would corrupt backticks inside the generated answers.
    if raw.startswith("```json"):
        raw = raw[len("```json"):]
    elif raw.startswith("```"):
        raw = raw[3:]
    raw = raw.strip()
    if raw.endswith("```"):
        raw = raw[:-3].strip()
    answers = json.loads(raw)
    # Guard against a valid-JSON reply that is not the requested array shape.
    if not isinstance(answers, list):
        raise ValueError(f"Expected a JSON array, got {type(answers).__name__}")
    return answers
# System message prepended to every training example: it fixes the assistant
# persona that the fine-tuned model is being trained to adopt.
SYSTEM_PROMPT = (
    "You are a helpful customer support assistant for TAIKAI, a hackathon and "
    "open innovation platform. Answer questions accurately and concisely based "
    "on your knowledge of TAIKAI's products and services."
)
def build_training_examples(faqs, questions_per_faq=10, answer_variants=3):
    """Build the full training dataset.

    Args:
        faqs: List of FAQ dicts with 'id', 'topic', 'question', 'answer' keys.
        questions_per_faq: Question variants to generate per FAQ.
        answer_variants: Answer phrasings to generate per FAQ.

    Returns:
        List of chat-format training examples; each is a dict with
        'messages' (system/user/assistant turns), 'faq_id' and 'topic'.
    """
    training_data = []
    for i, faq in enumerate(faqs):
        print(f"Processing FAQ {i+1}/{len(faqs)}: {faq['question'][:50]}...")
        # Generate question variants.  A single bad model reply must not
        # abort the whole run, so degrade to an empty list and keep going.
        # (The original `except (json.JSONDecodeError, Exception)` was
        # redundant: Exception already subsumes JSONDecodeError.)
        try:
            question_variants = generate_variants(faq, num_variants=questions_per_faq)
        except Exception as e:
            print(f" Warning: Failed to generate question variants for FAQ {faq['id']}: {e}")
            question_variants = []
        # Generate answer variants; the official answer is appended below
        # as a guaranteed fallback, so an empty list here is still usable.
        try:
            answer_options = generate_answer_variants(faq, num_variants=answer_variants)
        except Exception as e:
            print(f" Warning: Failed to generate answer variants for FAQ {faq['id']}: {e}")
            answer_options = []
        # Add the original answer too
        answer_options.append(faq['answer'])
        # Pair each question variant with a randomly selected answer variant
        for q in question_variants:
            a = random.choice(answer_options)
            training_data.append({
                "messages": [
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": q},
                    {"role": "assistant", "content": a}
                ],
                "faq_id": faq["id"],
                "topic": faq["topic"]
            })
        # Also include the original FAQ as a training example
        training_data.append({
            "messages": [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": faq["question"]},
                {"role": "assistant", "content": faq["answer"]}
            ],
            "faq_id": faq["id"],
            "topic": faq["topic"]
        })
        # Be nice to the API
        time.sleep(1)
    return training_data
def main():
    """Generate the dataset, shuffle, split 90/10, and write JSONL files."""
    faqs = load_faqs()
    print(f"Loaded {len(faqs)} FAQs")
    training_data = build_training_examples(
        faqs,
        questions_per_faq=10,
        answer_variants=3
    )
    print(f"\nGenerated {len(training_data)} training examples")
    # Shuffle so the train/val split is not ordered by FAQ topic.
    random.shuffle(training_data)
    # Split into train/validation (90/10)
    split_idx = int(len(training_data) * 0.9)
    train_data = training_data[:split_idx]
    val_data = training_data[split_idx:]
    # Write one JSON object per line (JSONL).  Explicit UTF-8 so the output
    # bytes do not depend on the platform's default encoding.
    with open("train.jsonl", "w", encoding="utf-8") as f:
        for example in train_data:
            f.write(json.dumps(example) + "\n")
    with open("val.jsonl", "w", encoding="utf-8") as f:
        for example in val_data:
            f.write(json.dumps(example) + "\n")
    print(f"Saved {len(train_data)} training and {len(val_data)} validation examples")
    # Print a few samples (slicing is safe even if fewer than 3 exist)
    print("\n--- Sample Training Examples ---")
    for ex in train_data[:3]:
        print(f"\nUser: {ex['messages'][1]['content']}")
        print(f"Assistant: {ex['messages'][2]['content'][:100]}...")


if __name__ == "__main__":
    main()