Skip to content

Commit 4ef2cd4

Browse files
committed
Add MedQA dataset
1 parent ab8bfd9 commit 4ef2cd4

File tree

2 files changed

+194
-0
lines changed

2 files changed

+194
-0
lines changed

src/med_reason_evals/data/medqa.py

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
"""Load and process the MedQA dataset.
2+
3+
Dataset: HuggingFace `GBaker/MedQA-USMLE-4-options` dataset.
4+
Each example is normalized to the following fields:
5+
{
6+
"question": "<question + formatted options>", # string used as the user prompt
7+
"answer": "<A|B|C|D>", # top-level gold letter
8+
"info": { ...original example fields... } # full source row for debugging
9+
}
10+
"""
11+
12+
from typing import Any
13+
14+
from datasets import load_dataset
15+
16+
17+
class MedQADataset:
    """Load and normalize the MedQA-USMLE-4-options dataset.

    Wraps the HuggingFace ``GBaker/MedQA-USMLE-4-options`` dataset and
    reshapes each row into the ``{"question", "answer", "info"}`` schema
    used by the verifiers environments:

    - ``question``: question stem plus formatted answer options
    - ``answer``: gold letter, one of ``A``/``B``/``C``/``D`` (or ``""``)
    - ``info``: the full original source row, kept for debugging
    """

    # Accepted gold labels; anything else is normalized to "".
    _VALID_ANSWERS = frozenset({"A", "B", "C", "D"})

    def __init__(
        self,
        num_train_examples: int = -1,
        num_test_examples: int = -1,
    ):
        """Initialize the MedQA dataset processor.

        Args:
            num_train_examples: Number of training examples to use (-1 for all)
            num_test_examples: Number of test examples to use (-1 for all)
        """
        self.num_train_examples = num_train_examples
        self.num_test_examples = num_test_examples
        # Fixed seed so the train/test shuffles are reproducible across runs.
        self.rng_seed = 12345

        # Load and process datasets eagerly on initialization (network/disk I/O).
        self.train_ds, self.test_ds = self._load_and_process_datasets()

    def _load_and_process_datasets(self) -> tuple[Any, Any]:
        """Load, truncate, format, and shuffle the train/test splits.

        Returns:
            Tuple of (train_dataset, test_dataset), each normalized via
            :meth:`_format_for_verifiers` and shuffled with ``self.rng_seed``.
        """
        ds = load_dataset("GBaker/MedQA-USMLE-4-options")
        train_raw = ds["train"]
        test_raw = ds["test"]

        # Optionally truncate each split; min() clamps to the split size so
        # requesting more examples than exist does not raise.
        if self.num_train_examples != -1:
            train_raw = train_raw.select(
                range(min(self.num_train_examples, len(train_raw)))
            )
        if self.num_test_examples != -1:
            test_raw = test_raw.select(
                range(min(self.num_test_examples, len(test_raw)))
            )

        # Normalize rows to the question/answer/info schema.
        train_formatted = self._format_for_verifiers(train_raw, "train")
        test_formatted = self._format_for_verifiers(test_raw, "test")

        # Deterministic shuffle of both splits.
        train_formatted = train_formatted.shuffle(seed=self.rng_seed)
        test_formatted = test_formatted.shuffle(seed=self.rng_seed)

        return train_formatted, test_formatted

    def _format_for_verifiers(self, dataset: Any, split: str) -> Any:
        """Map every row of *dataset* to ``{"question", "answer", "info"}``.

        Args:
            dataset: Source dataset; must expose a ``.map(fn)`` method.
            split: Split name ("train"/"test"). Currently unused; kept for
                interface stability and possible per-split formatting later.

        Returns:
            The mapped dataset.
        """
        valid = self._VALID_ANSWERS

        def format_row(row: dict) -> dict:
            row = dict(row)

            # Build the user-visible question string (stem + options).
            q = row.get("question", "") or ""
            opts = row.get("options", {}) or {}

            # Render only non-empty options, each on its own line.
            option_lines = [
                f"\n{letter}. {text}"
                for letter, text in opts.items()
                if text is not None and text != ""
            ]
            question_str = f"Question: {q}\n" + "".join(option_lines)

            # Lift the gold label top-level, normalized to a single letter.
            # str() guards against non-string labels (e.g. ints), which
            # would otherwise crash on .strip().
            ans = str(row.get("answer_idx") or "").strip().upper()
            if ans not in valid:
                # Final guard: unexpected labels become "" rather than
                # silently scoring as a plausible-looking letter.
                ans = ""

            # Keep the full original example under 'info' for debugging.
            info = dict(row)

            return {
                "question": question_str,
                "answer": ans,
                "info": info,
            }

        return dataset.map(format_row)
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
"""MedQA Evaluation.
2+
3+
Dataset: HuggingFace `GBaker/MedQA-USMLE-4-options` dataset.
4+
5+
- Parser: Extracts \\boxed{A|B|C|D} from completions
6+
- Reward Functions:
7+
- Correct answer reward
8+
- Format reward
9+
"""
10+
11+
import os
12+
import verifiers as vf
13+
from dotenv import load_dotenv
14+
from openai import OpenAI
15+
from verifiers.utils.data_utils import extract_boxed_answer
16+
17+
from med_reason_evals.data.medqa import MedQADataset
18+
from med_reason_evals.verifiers.exact_match_reward import (
19+
exact_match_reward_func,
20+
)
21+
22+
23+
def load_environment(
    use_think: bool = True,
    num_train_examples: int = -1,
    num_test_examples: int = -1,
) -> vf.SingleTurnEnv:
    """MedQA-USMLE-4-options multiple-choice evaluation.

    Args:
        use_think: Whether to require step-by-step reasoning inside
            <think> tags (default: True). When False, the model is asked
            for the boxed letter only and a plain boxed-answer parser is
            used.
        num_train_examples: Number of training examples to use (-1 for all)
        num_test_examples: Number of test examples to use (-1 for all)

    Returns:
        vf.SingleTurnEnv configured with MedQA dataset
    """
    dataset = MedQADataset(
        num_train_examples=num_train_examples,
        num_test_examples=num_test_examples,
    )

    options = "(A, B, C, or D)"  # MedQA has exactly 4 options

    # BUG FIX: `use_think` was previously accepted but ignored — the env
    # always demanded <think> reasoning. It now selects prompt + parser;
    # the default (True) preserves the original behavior exactly.
    if use_think:
        system_prompt = (
            f"Think step-by-step inside <think> tags, then give only the letter "
            f"of the correct answer inside \\boxed{{...}} {options}. Do not include option "
            f"text in the box; only the letter."
        )
        parser = vf.ThinkParser(extract_fn=extract_boxed_answer)
    else:
        system_prompt = (
            f"Give only the letter of the correct answer inside "
            f"\\boxed{{...}} {options}. Do not include option text in the box; "
            f"only the letter."
        )
        parser = vf.Parser(extract_fn=extract_boxed_answer)

    # Correct-answer reward drives the score; the format reward is logged
    # at weight 0.0 (tracked but not optimized).
    rubric = vf.Rubric(
        funcs=[exact_match_reward_func, parser.get_format_reward_func()],
        weights=[1.0, 0.0],
        parser=parser,
    )

    return vf.SingleTurnEnv(
        dataset=dataset.train_ds,
        eval_dataset=dataset.test_ds,
        system_prompt=system_prompt,
        parser=parser,
        rubric=rubric,
    )
66+
67+
68+
def main() -> None:
    """Run a small smoke-test evaluation on the MedQA dataset."""
    # Load environment variables (e.g. GROQ_API_KEY) from a local .env file.
    load_dotenv()

    # Fail fast with a clear message instead of an opaque HTTP 401 later.
    api_key = os.getenv("GROQ_API_KEY")
    if not api_key:
        raise RuntimeError(
            "GROQ_API_KEY is not set; add it to your environment or .env file."
        )

    # Build the MedQA environment (full train/test splits).
    env = load_environment(
        use_think=True,
        num_train_examples=-1,
        num_test_examples=-1,
    )

    # Groq exposes an OpenAI-compatible API, so the OpenAI client works
    # against it with a custom base URL.
    client = OpenAI(
        api_key=api_key,
        base_url="https://api.groq.com/openai/v1",
    )

    # Small run: 2 questions x 5 rollouts each.
    results = env.evaluate(
        client=client,
        model="llama-3.3-70b-versatile",
        num_examples=2,
        rollouts_per_example=5,
    )
    print(results)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)