Updated the evaluation type from model-graded to Includes for two of the four evals.

sakher-sawan · sakher-sawan · commit 239b09bb4654 · 2024-05-17T09:35:43.000+01:00
diff --git a/evals/registry/data/quran_eval/gen_script/main.py b/evals/registry/data/quran_eval/gen_script/main.py
@@ -84,9 +84,10 @@ def generate_bilingual_questions(ayas_df, question_type):
             ideal_answer_ar = [row['name'], row['transliteration'], row['translation']]
 
         elif question_type == "surah_type":
-            question_content_en = f"Determine if the Surah of the following Quranic aya text is meccan or madinan: {row['text']} answer only with either 'meccan' or 'madinan' (exactly in small case)."
+            question_content_en = f"Determine if the Surah of the following Quranic aya text is meccan or medinan: {row['text']} answer only with either 'meccan' or 'medinan' (exactly in small case)."
             question_content_ar = f"حدد إذا كانت السورة للنص القرآني التالي مكية أو مدنية: {row['text']} أجب فقط بـ 'مكية' أو 'مدنية' (بدون تشكيل)."
             answer_arabic_translations = ['مكية', 'مكي', 'مكة'] if row['type'] == 'meccan' else ['مدنية', 'مدني', 'المدينة']
+            answer_english_translations = ['meccan', 'meccan', 'mecca', "maccan"] if row['type'] == 'meccan' else ['madinan', 'medinan', 'madina']
             all_answers = [row['type']] + answer_arabic_translations
             ideal_answer = all_answers
             ideal_answer_ar = all_answers
diff --git a/evals/registry/data/quran_eval/guess_quran_surah_type.jsonl b/evals/registry/data/quran_eval/guess_quran_surah_type.jsonl
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:725dbd3afa688a7cedbc6c7a5b65755ae9206005a4f46f9370b43792620d33b7
+oid sha256:50c10be59d2b0766a577b82da112f1a0f088f5cdb6531d366bec88140931c45b
 size 195173
diff --git a/evals/registry/evals/quran_eval.yaml b/evals/registry/evals/quran_eval.yaml
@@ -11,25 +11,23 @@ guess_quran_surah_name.dev.v0:
 
 guess_quran_surah_type:
   id: guess_quran_surah_type.dev.v0
-  description: Tests the model's ability to guess the type of a Quranic Surah (chapter) for a given verse (Aya) (e.g. Meccan or Medinan)
+  description: Tests the model's ability to guess the type of a Quranic Surah (chapter) for a given verse (Aya) (e.g., Meccan or Medinan)
   metrics: [accuracy]
 guess_quran_surah_type.dev.v0:
-  class: evals.elsuite.modelgraded.classify:ModelBasedClassify
+  class: evals.elsuite.basic.includes:Includes
   args:
     samples_jsonl: quran_eval/guess_quran_surah_type.jsonl
-    eval_type: cot_classify
-    modelgraded_spec: simple_fact
+    ignore_case: true
+
 
 guess_which_text_is_from_quran:
   id: guess_which_text_is_from_quran.dev.v0
   description: Tests the model's ability to guess which text is from the Quran.
   metrics: [accuracy]
 guess_which_text_is_from_quran.dev.v0:
-  class: evals.elsuite.modelgraded.classify:ModelBasedClassify
+  class: evals.elsuite.basic.includes:Includes
   args:
     samples_jsonl: quran_eval/guess_which_text_is_from_quran.jsonl
-    eval_type: cot_classify
-    modelgraded_spec: simple_fact
 
 masked_quranic_text:
   id: masked_quranic_text.dev.v0
@@ -40,4 +38,4 @@ masked_quranic_text.dev.v0:
   args:
     samples_jsonl: quran_eval/masked_quranic_text.jsonl
     eval_type: cot_classify
-    modelgraded_spec: simple_fact
+    modelgraded_spec: simple_fact