Skip to content

Commit bccc277

Browse files
committed
Add MetaMedQA dataset
1 parent 3160b02 commit bccc277

File tree

2 files changed

+210
-0
lines changed

2 files changed

+210
-0
lines changed
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
"""Load and process the MetaMedQA dataset.
2+
3+
Dataset: HuggingFace `maximegmd/MetaMedQA` dataset.
4+
Each example is normalized to the fields expected by `vf.Verifiers`:
5+
{
6+
"question": "<formatted question + options>", # string used as the user prompt
7+
"answer": "<A|B|C|D|E>", # top-level gold letter
8+
"info": { ...original example fields... } # full source row for debugging
9+
}
10+
"""
11+
12+
from typing import Any
13+
14+
from datasets import load_dataset
15+
16+
17+
class MetaMedQADataset:
    """Load, format, and shuffle the MetaMedQA dataset for verifiers.

    Each row of the resulting dataset carries the fields verifiers expects:
    ``question`` (prompt string with the options appended), ``answer``
    (gold option letter), and ``info`` (the full original row).
    """

    def __init__(
        self,
        split: str = "test",
        num_examples: int = -1,
    ):
        """Initialize the MetaMedQA dataset processor.

        Args:
            split: Dataset split to use (train, validation, test)
            num_examples: Number of examples to use (-1 for all)
        """
        self.split = split
        self.num_examples = num_examples
        # Fixed seed so the post-formatting shuffle is reproducible.
        self.rng_seed = 12345

        # Load and process immediately so `self.dataset` is ready to use.
        self.dataset = self._load_and_process_dataset()

    def _load_and_process_dataset(self) -> Any:
        """Load the raw split, optionally truncate, format, and shuffle."""
        raw = load_dataset("maximegmd/MetaMedQA", split=self.split)

        # Truncate to the requested number of examples (-1 keeps everything).
        if self.num_examples != -1:
            keep = min(self.num_examples, len(raw))
            raw = raw.select(range(keep))

        formatted = self._format_for_verifiers(raw)
        return formatted.shuffle(seed=self.rng_seed)

    def _build_prompt(self, question: str, options: dict) -> str:
        """Render the question plus its lettered options as one prompt string."""
        option_lines = "\n".join(f"{letter}. {text}" for letter, text in options.items())
        letter_list = ", ".join(sorted(options.keys()))
        segments = [
            "You are a clinician. Choose exactly ONE option letter.",
            f"Question:\n{question}",
            f"Options:\n{option_lines}",
            f"Answer with ONLY the letter ({letter_list}).",
        ]
        return "\n\n".join(segments)

    def _format_for_verifiers(self, dataset: Any) -> Any:
        """Map each raw row to the question/answer/info fields verifiers needs."""
        valid = {"A", "B", "C", "D", "E"}

        def format_row(row: dict) -> dict:
            row = dict(row)

            q: str = row["question"]
            options: dict = row["options"]
            gold_text: str = row["answer"]

            # Match the gold answer text against the option texts
            # (case- and whitespace-insensitive) to recover its letter.
            wanted = (gold_text or "").strip().lower()
            gold_letter = next(
                (
                    letter
                    for letter, text in options.items()
                    if (text or "").strip().lower() == wanted
                ),
                None,
            )

            # No match (or a letter outside A-E): fall back to the first
            # option so every row still carries a well-formed gold letter.
            # NOTE(review): this silently mislabels unmatched rows rather
            # than filtering them out — confirm this is acceptable.
            if gold_letter not in valid:
                gold_letter = next(iter(options))

            return {
                "question": self._build_prompt(q, options),
                "answer": gold_letter,
                # Full source row retained for debugging.
                "info": dict(row),
            }

        return dataset.map(format_row)
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
"""MetaMedQA Evaluation.
2+
3+
Dataset: HuggingFace `maximegmd/MetaMedQA` dataset.
4+
5+
- Parser: Extracts first letter A-Z from completions
6+
- Reward Functions:
7+
- Correct answer reward
8+
- Format reward
9+
"""
10+
11+
import os
12+
from typing import Any
13+
14+
import verifiers as vf
15+
from dotenv import load_dotenv
16+
from openai import OpenAI
17+
18+
from med_reason_evals.data.metamedqa import MetaMedQADataset
19+
from med_reason_evals.verifiers.answer_correctness_reward import (
20+
correct_answer_reward_func,
21+
)
22+
23+
24+
class LetterParser:
    """Parser that extracts the answer letter (A-Z) from completions.

    Reasoning wrapped in ``<think>...</think>`` tags is removed before the
    letter is extracted. Without this, the system prompt's instruction to
    think inside tags would cause the first letter of the *reasoning* text
    (e.g. "T" from "The patient...") to be returned instead of the final
    answer letter.
    """

    def __init__(self) -> None:
        """Initialize the LetterParser."""
        pass

    def parse_answer(self, completion: Any) -> str:
        """Extract the first letter A-Z from the completion's answer text.

        Args:
            completion: Either a raw string or a chat-style list of message
                dicts (the last message's ``content`` is used).

        Returns:
            The first A-Z character found after think-tag stripping,
            uppercased, or "" if none is present.
        """
        text = self._get_text_from_completion(completion)
        text = self._strip_think(text)
        return self._first_letter(text) or ""

    def get_format_reward_func(self) -> Any:
        """Return a format reward function (simple placeholder)."""

        def format_reward(
            parser: Any, completion: str, answer: str, **kwargs: Any
        ) -> float:
            # Basic format reward - just check if we were able to extract a letter
            parsed = self.parse_answer(completion)
            return 1.0 if parsed != "" else 0.0

        return format_reward

    def _get_text_from_completion(self, completion: Any) -> str:
        """Normalize a string or chat-message-list completion to plain text."""
        if isinstance(completion, str):
            return completion
        if isinstance(completion, list) and completion:
            last = completion[-1]
            if isinstance(last, dict):
                return str(last.get("content", ""))
            return str(last)
        return str(completion)

    def _strip_think(self, text: str) -> str:
        """Remove ``<think>...</think>`` spans (case-insensitive) from text.

        An unclosed ``<think>`` tag discards everything after it, since the
        trailing text is reasoning that never reached a final answer.
        """
        lowered = (text or "").lower()
        pieces = []
        pos = 0
        while True:
            start = lowered.find("<think>", pos)
            if start == -1:
                pieces.append(text[pos:])
                break
            pieces.append(text[pos:start])
            end = lowered.find("</think>", start)
            if end == -1:
                # Unclosed tag: drop the rest of the text.
                break
            pos = end + len("</think>")
        return "".join(pieces)

    def _first_letter(self, text: str) -> str:
        """Return the first A-Z character of the uppercased text, or ""."""
        t = (text or "").upper()
        for ch in t:
            if "A" <= ch <= "Z":
                return ch
        return ""
64+
65+
66+
def main() -> None:
    """Run the evaluation on the MetaMedQA dataset.

    Loads the full test split, builds a single-turn verifiers environment
    with a letter parser and a correctness + format rubric, and evaluates
    a Groq-hosted model on a small smoke-test slice.
    """
    # Load environment variables (GROQ_API_KEY).
    load_dotenv()

    # Create an instance of the processor (-1 = full split).
    dataset = MetaMedQADataset(split="test", num_examples=-1)

    # Construct prompts. The think-tag instruction must use a well-formed
    # tag so the parser can strip the reasoning before extracting the letter.
    system_prompt = (
        "Think step-by-step inside <think> tags, then give only the letter "
        "of the correct answer. Do not include option text; only the letter."
    )

    parser = LetterParser()

    # Correctness drives the score; the format reward is tracked but
    # weighted 0 so it only serves as a diagnostic.
    rubric = vf.Rubric(
        funcs=[correct_answer_reward_func, parser.get_format_reward_func()],
        weights=[1.0, 0.0],
        parser=parser,
    )

    env = vf.SingleTurnEnv(
        dataset=dataset.dataset,
        eval_dataset=dataset.dataset,  # Using same dataset for both train and eval as in original
        system_prompt=system_prompt,
        parser=parser,
        rubric=rubric,
    )

    # Run the evaluation. Groq's OpenAI-compatible endpoint lives under
    # /openai/v1 (the bare /v1 path returns 404).
    client = OpenAI(
        api_key=os.getenv("GROQ_API_KEY"),
        base_url="https://api.groq.com/openai/v1",
    )
    results = env.evaluate(
        client=client,
        model="llama-3.3-70b-versatile",
        num_examples=2,  # smoke-test slice; raise for a full run
        rollouts_per_example=5,
    )
    print(results)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)