Added Quran Eval

sakher · sakher · commit 1a13a3437c80 · 2024-04-01T05:26:56.000+04:00
diff --git a/evals/registry/data/quran_eval/guess_quran_verse_name.jsonl b/evals/registry/data/quran_eval/guess_quran_verse_name.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d102c8042318245beb349794ff93a27ce7f2c76cb6d9d0a11a2c81c2b3b7ce9c
+size 157821
diff --git a/evals/registry/data/quran_eval/guess_quran_verse_type.jsonl b/evals/registry/data/quran_eval/guess_quran_verse_type.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:26c157f176fe241c96a2f38a9eab980e21cb28393d74e903032fba3b28314bad
+size 180906
diff --git a/evals/registry/data/quran_eval/guess_which_text_is_from_quran.jsonl b/evals/registry/data/quran_eval/guess_which_text_is_from_quran.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:219103374aaef27cb66e295896b3afbb1af2543ba7315639b3b72e4df49b09a5
+size 823143
diff --git a/evals/registry/data/quran_eval/masked_quranic_text.jsonl b/evals/registry/data/quran_eval/masked_quranic_text.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d0ad95b6ac9ef3b3e6e0bd24e5a00a3967a7f5ffaacb6c2f7337090db3a1aa88
+size 200064
diff --git a/evals/registry/eval_sets/quran-evals.yaml b/evals/registry/eval_sets/quran-evals.yaml
@@ -0,0 +1,6 @@
+quran-evals:
+  evals:
+    - guess_quran_verse_name
+    - guess_quran_verse_type
+    - guess_which_text_is_from_quran
+    - masked_quranic_text
diff --git a/evals/registry/evals/quran_eval.yaml b/evals/registry/evals/quran_eval.yaml
@@ -0,0 +1,43 @@
+guess_quran_verse_name:
+  id: guess_quran_verse_name.dev.v0
+  description: Tests the model's ability to guess the name of a Quranic verse.
+  metrics: [accuracy]
+guess_quran_verse_name.dev.v0:
+  class: evals.elsuite.modelgraded.classify:ModelBasedClassify
+  args:
+    samples_jsonl: quran_eval/guess_quran_verse_name.jsonl
+    eval_type: cot_classify
+    modelgraded_spec: simple_fact
+
+guess_quran_verse_type:
+  id: guess_quran_verse_type.dev.v0
+  description: Tests the model's ability to guess the type of a Quranic verse (e.g. Meccan or Medinan).
+  metrics: [accuracy]
+guess_quran_verse_type.dev.v0:
+  class: evals.elsuite.modelgraded.classify:ModelBasedClassify
+  args:
+    samples_jsonl: quran_eval/guess_quran_verse_type.jsonl
+    eval_type: cot_classify
+    modelgraded_spec: simple_fact
+
+guess_which_text_is_from_quran:
+  id: guess_which_text_is_from_quran.dev.v0
+  description: Tests the model's ability to guess which text is from the Quran.
+  metrics: [accuracy]
+guess_which_text_is_from_quran.dev.v0:
+  class: evals.elsuite.modelgraded.classify:ModelBasedClassify
+  args:
+    samples_jsonl: quran_eval/guess_which_text_is_from_quran.jsonl
+    eval_type: cot_classify
+    modelgraded_spec: simple_fact
+
+masked_quranic_text:
+  id: masked_quranic_text.dev.v0
+  description: Tests the model's ability to predict masked Quranic text.
+  metrics: [accuracy]
+masked_quranic_text.dev.v0:
+  class: evals.elsuite.modelgraded.classify:ModelBasedClassify
+  args:
+    samples_jsonl: quran_eval/masked_quranic_text.jsonl
+    eval_type: cot_classify
+    modelgraded_spec: simple_fact
diff --git a/evals/registry/modelgraded/simple_fact.yaml b/evals/registry/modelgraded/simple_fact.yaml
@@ -0,0 +1,23 @@
+simple_fact:
+  prompt: |-
+    You are tasked with comparing a submitted answer to an expert answer on a specific question. Here are the details provided:
+    [BEGIN DATA]
+    ************
+    [Question]: {input}
+    ************
+    [Expert Answer]: {ideal}
+    ************
+    [Submitted Answer]: {completion}
+    ************
+    [END DATA]
+
+    Based on the factual content of the submitted answer compared to the expert answer, determine if the submitted answer is valid and complete. Consider only the factual accuracy and completeness of the answer, ignoring differences in style, grammar, or punctuation.
+    Provide your judgment as either 'Y' for a valid and complete answer or 'N' for an invalid or incomplete answer. Your evaluation should focus on the following criteria:
+    1. Does the submitted answer accurately reflect the factual content of the expert answer?
+    2. Is the submitted answer complete, without missing any crucial details provided in the expert answer?
+  choice_scores:
+    "Y": 1.0
+    "N": 0.0
+  choice_strings: YN
+  input_outputs:
+    input: completion

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+version https://git-lfs.github.com/spec/v1`
	`2`	`+oid sha256:d102c8042318245beb349794ff93a27ce7f2c76cb6d9d0a11a2c81c2b3b7ce9c`
	`3`	`+size 157821`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+version https://git-lfs.github.com/spec/v1`
	`2`	`+oid sha256:26c157f176fe241c96a2f38a9eab980e21cb28393d74e903032fba3b28314bad`
	`3`	`+size 180906`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+version https://git-lfs.github.com/spec/v1`
	`2`	`+oid sha256:219103374aaef27cb66e295896b3afbb1af2543ba7315639b3b72e4df49b09a5`
	`3`	`+size 823143`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+version https://git-lfs.github.com/spec/v1`
	`2`	`+oid sha256:d0ad95b6ac9ef3b3e6e0bd24e5a00a3967a7f5ffaacb6c2f7337090db3a1aa88`
	`3`	`+size 200064`