"""Load and process the PubMedQA dataset.

Source: the HuggingFace `qiaojin/PubMedQA` dataset.
Each example is normalized to the following fields:
{
    "question": "<formatted question with context>",  # complete prompt with abstract
    "answer": "<A|B|C>",                               # A=yes, B=no, C=maybe
    "info": { ...original example fields... }          # full source row for debugging
}
"""

import json
import os
from typing import Any

from datasets import load_dataset


class PubMedQADataset:
    """Process the PubMedQA dataset."""

    def __init__(
        self,
        num_train_examples: int = -1,
        num_test_examples: int = -1,
    ):
        """Initialize the PubMedQA dataset processor.

        Args:
            num_train_examples: Number of training examples to use (-1 for all)
            num_test_examples: Number of test examples to use (-1 for all)
        """
        self.num_train_examples = num_train_examples
        self.num_test_examples = num_test_examples
        self.rng_seed = 12345
        self.dataset_path = "qiaojin/PubMedQA"

        # Load and process datasets on initialization
        self.train_ds, self.test_ds = self._load_and_process_datasets()

    def _load_and_process_datasets(self) -> tuple:
        """Load and process the PubMedQA datasets."""
        # Load the raw datasets
        # pqa_artificial is the training set, pqa_labeled is the test set
        train_raw = load_dataset(
            self.dataset_path, name="pqa_artificial", split="train"
        )
        test_raw = load_dataset(self.dataset_path, name="pqa_labeled", split="train")

        # Filter test set to only include human-annotated samples
        test_raw = self._filter_test_set(test_raw)

        # Limit number of examples if specified
        if self.num_train_examples != -1:
            train_raw = train_raw.select(
                range(min(self.num_train_examples, len(train_raw)))
            )
        if self.num_test_examples != -1:
            test_raw = test_raw.select(
                range(min(self.num_test_examples, len(test_raw)))
            )

        # Format datasets
        train_formatted = self._format_dataset(train_raw, "train")
        test_formatted = self._format_dataset(test_raw, "test")

        # Shuffle datasets
        train_formatted = train_formatted.shuffle(seed=self.rng_seed)
        test_formatted = test_formatted.shuffle(seed=self.rng_seed)

        return train_formatted, test_formatted

    def _filter_test_set(self, dataset: Any) -> Any:
        """Filter test set to only include human-annotated samples (500 of 1000)."""
        # Load the predefined test IDs
        here = os.path.dirname(__file__)
        file_path = os.path.join(here, "data", "test_ground_truth.json")
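        # NOTE (assumption, not pinned down by this module): the filter below only
        # requires that `str(pubid) in test_ids` holds for the 500 expert-annotated
        # examples, so test_ground_truth.json may be a JSON list of pubid strings or
        # an object keyed by pubid.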

        try:
            with open(file_path) as f:
                test_ids = json.load(f)

            # Filter to only the 500 human-annotated samples
            return dataset.filter(lambda sample: str(sample["pubid"]) in test_ids)
        except FileNotFoundError:
            # If the file doesn't exist, return the full test set
            print(f"Warning: {file_path} not found. Using full test set.")
            return dataset

    def _format_dataset(self, dataset: Any, split: str) -> Any:
        """Format dataset with question, answer, and info fields."""
        choices_map = {"yes": "A", "no": "B", "maybe": "C"}
        prompt_template = "Answer A for yes, B for no or C for maybe.\n\nContext: {context}\n\nQuestion: {question}\nAnswer:"

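        # Illustrative example (field values assumed, not copied from the dataset):
        # a row whose context has labels ["BACKGROUND", "METHODS"] and two matching
        # paragraphs yields a prompt like
        #   "Answer A for yes, B for no or C for maybe.\n\nContext: BACKGROUND. <p1>\nMETHODS. <p2>\n\nQuestion: <q>\nAnswer:"
        # and a final_decision of "yes" / "no" / "maybe" maps to answer "A" / "B" / "C".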
        def format_row(row: dict) -> dict:
            row = dict(row)

            # Extract question
            question_text = row.get("question", "") or ""

            # Extract and format context
            context_dict = row.get("context", {}) or {}
            labels = context_dict.get("labels", []) or []
            contexts = context_dict.get("contexts", []) or []

            # Format contexts with their labels
            formatted_contexts = []
            for label, context in zip(labels, contexts):
                formatted_contexts.append(f"{label}. {context}")
            context_text = "\n".join(formatted_contexts)

            # Build complete prompt
            complete_prompt = prompt_template.format(
                context=context_text, question=question_text
            )

            # Map final decision to letter (A/B/C)
            final_decision = (row.get("final_decision", "") or "").lower()
            answer = choices_map.get(final_decision, "")

            # Keep full original example under 'info'
            info = dict(row)

            return {
                "question": complete_prompt,
                "answer": answer,
                "info": info,
            }

        return dataset.map(format_row, load_from_cache_file=False)
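

# Minimal usage sketch (illustrative; assumes the `datasets` library is installed
# and the HuggingFace Hub is reachable). The small example counts are arbitrary.
if __name__ == "__main__":
    dataset = PubMedQADataset(num_train_examples=8, num_test_examples=8)
    example = dataset.test_ds[0]
    print(example["question"])
    print("Expected answer:", example["answer"])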