Skip to content

Commit 6c76796

Browse files
committed
Added generation script with seed to ensure the same questions are generated every time.
Generated a new set of evals based on the new generation script
1 parent 1b3f4a9 commit 6c76796

File tree

9 files changed

+7435
-8
lines changed

9 files changed

+7435
-8
lines changed
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
generated
Lines changed: 220 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,220 @@
1+
import pandas as pd
2+
import json
3+
import random
4+
import re
5+
import os
6+
7+
SEED = 42
8+
9+
def load_quran_data(file_path):
    """Read the pipe-delimited Quran text file into a DataFrame.

    The file is read without a header row; its three fields are named
    ``chapter``, ``verse`` and ``text``.

    Args:
        file_path: Location of the ``|``-separated file.

    Returns:
        A DataFrame with the columns ``chapter``, ``verse`` and ``text``.
    """
    column_names = ['chapter', 'verse', 'text']
    quran_frame = pd.read_csv(
        file_path,
        sep='|',
        header=None,
        names=column_names,
    )
    return quran_frame
11+
12+
def load_chapter_names(file_path):
    """Load the chapter-names JSON file as a DataFrame.

    Args:
        file_path: Location of the JSON file.

    Returns:
        Whatever ``pandas.read_json`` builds from that file.
    """
    chapters_frame = pd.read_json(file_path)
    return chapters_frame
14+
15+
def extract_random_ayas(df, number_of_ayas):
    """Draw a reproducible random sample of rows from ``df``.

    Args:
        df: DataFrame to sample from.
        number_of_ayas: How many rows to draw.

    Returns:
        A DataFrame holding the sampled rows.
    """
    # The stdlib generator is reseeded as a side effect; the pandas draw
    # below receives the seed explicitly through ``random_state``.
    random.seed(SEED)
    sampled_rows = df.sample(random_state=SEED, n=number_of_ayas)
    return sampled_rows
18+
19+
def load_distractors(file_path):
    """Parse the UTF-8 JSON file at ``file_path`` and return its content.

    Args:
        file_path: Location of the distractors JSON file.

    Returns:
        The deserialised JSON document.
    """
    with open(file_path, encoding='utf-8') as handle:
        return json.load(handle)
23+
24+
def generate_mcq_questions(aya, distractors_list, n=3, rng=None):
    """Build one "which of these is Quranic text?" multiple-choice question.

    The verse text is combined with ``n`` distractors, the options are
    shuffled and labelled with consecutive capital letters, and the same
    option list is embedded in an English and an Arabic prompt.

    Args:
        aya: Mapping-like row whose ``'text'`` entry is the correct answer.
        distractors_list: Candidate wrong answers; entries equal to the
            verse text are never used.
        n: Number of distractors to include (default 3, i.e. options A-D).
        rng: Optional randomness source exposing ``sample`` and ``shuffle``
            (for example a ``random.Random`` instance). When omitted, the
            module-level ``random`` generator is used.

    Returns:
        Tuple ``(question_content_en, question_content_ar, correct_label)``
        where ``correct_label`` is the letter of the verse text.

    Raises:
        ValueError: If more than 26 options would be needed, or (from
            ``sample``) if fewer than ``n`` usable distractors exist.
    """
    if n + 1 > 26:
        raise ValueError("at most 25 distractors are supported (labels A-Z)")

    # Deliberately no random.seed() here: reseeding on every call made each
    # question draw the same distractors and apply the same shuffle, so the
    # correct answer always landed on the same letter. Seed once before
    # generating a batch (or pass a seeded ``rng``) for reproducible output.
    picker = random if rng is None else rng

    correct_answer = aya['text']
    valid_distractors = [d for d in distractors_list if d != correct_answer]
    options = picker.sample(valid_distractors, n) + [correct_answer]
    picker.shuffle(options)

    # Derive the labels from the option count so that no option (in
    # particular the correct one) is dropped when n != 3.
    option_labels = [chr(ord('A') + offset) for offset in range(len(options))]
    letters = ", ".join(option_labels)

    options_text = "; ".join(
        f"{label}. {option}" for label, option in zip(option_labels, options)
    )
    question_content_en = f"Which of the following is a text from the Quran? {options_text}, please answer with the letter of the correct option ({letters}) only"
    question_content_ar = f"أي من التالي هو نص من القرآن؟ {options_text}, يرجى الإجابة بحرف الخيار الصحيح ({letters}) فقط"

    correct_label = option_labels[options.index(correct_answer)]

    return question_content_en, question_content_ar, correct_label
43+
44+
def redact_aya(text, all_texts):
    """Blank out a random run of words so the remainder fits a single text.

    Up to 100 random word ranges are tried. A range is accepted when the
    pattern "words before the blank, anything, words after the blank"
    matches (from the start of the string) exactly one entry of
    ``all_texts``.

    Args:
        text: Verse text to redact.
        all_texts: Texts against which the uniqueness check is run.

    Returns:
        ``(redacted_aya, first_section, missing_section, third_section)``
        for the first accepted range, or ``None`` when ``text`` has at most
        one word or no range was accepted.
    """
    # NOTE(review): the generator is reseeded on every call, so each call
    # starts from the same random stream.
    random.seed(SEED)

    tokens = text.split()
    word_count = len(tokens)
    if word_count <= 1:
        return None

    attempts_left = 100
    while attempts_left > 0:
        attempts_left -= 1

        blank_from = random.randint(0, word_count - 1)
        blank_to = random.randint(blank_from + 1, word_count)

        first_section = ' '.join(tokens[:blank_from])
        missing_section = ' '.join(tokens[blank_from:blank_to])
        third_section = ' '.join(tokens[blank_to:])

        pattern = re.escape(first_section) + '.*' + re.escape(third_section)
        match_count = sum(1 for candidate in all_texts if re.match(pattern, candidate))
        if match_count != 1:
            continue

        redacted_aya = f"{first_section} ________ {third_section}".strip()
        return redacted_aya, first_section, missing_section, third_section

    return None
63+
64+
def generate_bilingual_questions(ayas_df, question_type, distractors_list=None):
    """Turn each row of ``ayas_df`` into one chat-style eval sample.

    The first half of the rows (by position) get the English prompt, the
    remaining rows get the Arabic prompt.

    Args:
        ayas_df: DataFrame with one verse per row. Columns read depend on
            ``question_type``: ``missing_text`` reads ``redacted``,
            ``missing_section``, ``name``, ``chapter`` and ``verse``;
            ``surah_name`` reads ``text``, ``name``, ``transliteration``
            and ``translation``; ``surah_type`` reads ``text`` and
            ``type``; ``mcq`` passes the row to ``generate_mcq_questions``.
        question_type: One of ``"missing_text"``, ``"surah_name"``,
            ``"surah_type"`` or ``"mcq"``.
        distractors_list: Wrong answers for ``"mcq"`` questions. When
            omitted, a module-level ``distractors_list`` is used if one
            exists.

    Returns:
        List of dicts with an ``"input"`` message list (system + user) and
        an ``"ideal"`` list of accepted answers.

    Raises:
        ValueError: For an unknown ``question_type``, or for ``"mcq"``
            when no distractors are available.
    """
    supported_types = ("missing_text", "surah_name", "surah_type", "mcq")
    if question_type not in supported_types:
        raise ValueError(
            f"Unknown question_type {question_type!r}; expected one of {supported_types}"
        )

    is_mcq = question_type == "mcq"
    if is_mcq:
        if distractors_list is None:
            # Backward compatibility: earlier callers relied on a
            # module-level variable of the same name.
            distractors_list = globals().get("distractors_list")
        if distractors_list is None:
            raise ValueError("distractors_list is required for 'mcq' questions")
        # Only the MCQ path consumes randomness, so only it reseeds the
        # generator to keep the produced batch reproducible.
        random.seed(SEED)

    if is_mcq:
        user_prompt_en = "Please provide the answer by selecting the correct letter (A, B, C, or D) without any extra commentary"
        user_prompt_ar = "يرجى تقديم الإجابة عن طريق تحديد الحرف الصحيح (A, B, C, أو D) دون أي تعليق إضافي"
    else:
        user_prompt_en = "Please provide the answer, and ONLY the answer without any extra commentary"
        user_prompt_ar = "يرجى تقديم الإجابة. وفقط الإجابة دون أي تعليق إضافي"

    bilingual_questions = []
    half_length = len(ayas_df) // 2

    # Split by row position, not by index label: iterrows() yields labels,
    # which are not 0..n-1 once rows have been filtered out of the frame.
    for position, (_, row) in enumerate(ayas_df.iterrows()):
        if question_type == "missing_text":
            # The chapter/verse hint is attached to every other question,
            # starting with the first one.
            include_extra_info = position % 2 == 0
            extra_info_en = f" This text is from Surah {row['name']} (Chapter {row['chapter']}, Verse {row['verse']})." if include_extra_info else ""
            extra_info_ar = f" هذا النص القرآني من سورة {row['name']} (السورة {row['chapter']}، الآية {row['verse']})." if include_extra_info else ""

            # NOTE(review): these templates always yield a doubled period
            # ("..") before the last sentence; left untouched so generated
            # prompts stay identical.
            question_content_en = f"Fill in the blank of the following Quranic text: (({row['redacted']})) to complete the full verse.{extra_info_en}. The answer may be one or more words."
            question_content_ar = f"املأ الفراغ في النص القرآني التالي: (({row['redacted']})) لإتمام الآية كاملة.{extra_info_ar}. قد تكون الإجابة عبارة عن كلمة واحدة أو أكثر."
            ideal_answer = [row['missing_section']]

        elif question_type == "surah_name":
            question_content_en = f"Identify the Surah (in Arabic) of the following Quranic text: {row['text']} (Please provide the answer without diacritics but keep hamza and madda)."
            question_content_ar = f"حدد اسم السورة للنص القرآني التالي: {row['text']} (يرجى تقديم الإجابة بدون تشكيل ولكن احتفظ بالهمزة والمد)."
            ideal_answer = [row['name'], row['transliteration'], row['translation']]

        elif question_type == "surah_type":
            question_content_en = f"Determine if the Surah of the following Quranic aya text is meccan or madinan: {row['text']} answer only with either 'meccan' or 'madinan' (exactly in small case)."
            question_content_ar = f"حدد إذا كانت السورة للنص القرآني التالي مكية أو مدنية: {row['text']} أجب فقط بـ 'مكية' أو 'مدنية' (بدون تشكيل)."
            answer_arabic_translations = ['مكية', 'مكي', 'مكة'] if row['type'] == 'meccan' else ['مدنية', 'مدني', 'المدينة']
            ideal_answer = [row['type']] + answer_arabic_translations

        else:  # "mcq"
            question_content_en, question_content_ar, correct_label = generate_mcq_questions(row, distractors_list)
            ideal_answer = [correct_label]

        # The accepted answers are the same for both languages; only the
        # prompt language differs.
        use_english = position < half_length
        bilingual_questions.append({
            "input": [
                {"role": "system", "content": question_content_en if use_english else question_content_ar},
                {"role": "user", "content": user_prompt_en if use_english else user_prompt_ar},
            ],
            "ideal": ideal_answer,
        })

    return bilingual_questions
121+
122+
if __name__ == '__main__':
    # Main process: sample verses, build the four question sets and write
    # each set both as pretty-printed JSON and as JSON Lines.
    quran_file_path = 'evals/registry/data/quran_eval/gen_script/resources/Arabic-Original.csv'
    chapters_file_path = 'evals/registry/data/quran_eval/gen_script/resources/chapters-en.json'
    distractors_file_path = 'evals/registry/data/quran_eval/gen_script/resources/distractors_not_quranic.json'

    random.seed(SEED)

    # Load and prepare data
    quran_df = load_quran_data(quran_file_path)
    chapters_df = load_chapter_names(chapters_file_path)
    random_ayas_df = extract_random_ayas(quran_df, 350)
    # Kept as a module-level name: the MCQ generation reads it from here.
    distractors_list = load_distractors(distractors_file_path)

    random_ayas_df = random_ayas_df.merge(chapters_df, left_on='chapter', right_on='id')
    random_ayas_df.drop(columns=['id', 'total_verses'], inplace=True)

    # Apply the redaction process and validation
    all_texts = quran_df['text'].tolist()
    validated_ayas = []

    for _, row in random_ayas_df.iterrows():
        result = redact_aya(row['text'], all_texts)
        if result is None:
            continue

        # Keep a verse only if no other verse starts with the same text.
        # The comparison is literal: using the verse as a regex pattern
        # would misread any regex metacharacter it contains.
        verse_text = row['text']
        prefix_matches = sum(1 for candidate in all_texts if candidate.startswith(verse_text))
        if prefix_matches != 1:
            continue

        row['redacted'], row['first_section'], row['missing_section'], row['third_section'] = result
        validated_ayas.append(row)

    validated_ayas_df = pd.DataFrame(validated_ayas)

    # Generate bilingual questions
    bilingual_missing_text_questions = generate_bilingual_questions(validated_ayas_df, "missing_text")
    bilingual_surah_name_questions = generate_bilingual_questions(validated_ayas_df, "surah_name")
    bilingual_surah_type_questions = generate_bilingual_questions(validated_ayas_df, "surah_type")
    # Generate MCQ questions (these use every sampled verse, not only the
    # ones that passed the redaction validation)
    question_type = "mcq"
    mcq_questions = generate_bilingual_questions(random_ayas_df, question_type)

    # File stem -> question set. Each set is written twice: a readable
    # JSON file under gen_script/generated and the final JSON Lines file.
    question_sets = {
        'masked_quranic_text': bilingual_missing_text_questions,
        'guess_quran_surah_name': bilingual_surah_name_questions,
        'guess_quran_surah_type': bilingual_surah_type_questions,
        'guess_which_text_is_from_quran': mcq_questions,
    }

    generated_folder = 'evals/registry/data/quran_eval/gen_script/generated'
    output_folder = 'evals/registry/data/quran_eval'

    # Create the output folders if they don't exist
    os.makedirs(generated_folder, exist_ok=True)
    os.makedirs(output_folder, exist_ok=True)

    for stem, questions in question_sets.items():
        # Readable, indented JSON
        with open(f'{generated_folder}/{stem}.json', 'w', encoding='utf-8') as file:
            json.dump(questions, file, ensure_ascii=False, indent=4)

        # JSON Lines: one question per line
        with open(f'{output_folder}/{stem}.jsonl', 'w', encoding='utf-8') as file:
            for question in questions:
                file.write(json.dumps(question, ensure_ascii=False) + '\n')

0 commit comments

Comments
 (0)