|
14 | 14 |
|
15 | 15 | import os |
16 | 16 | import sys |
| 17 | +import tempfile |
17 | 18 | from collections import defaultdict |
18 | 19 |
|
19 | 20 | import pytest |
|
25 | 26 | from examples.run_grpo_math import hf_data_processor |
26 | 27 | from nemo_rl.algorithms.utils import get_tokenizer |
27 | 28 | from nemo_rl.data.datasets import AllTaskProcessedDataset |
| 29 | +from nemo_rl.data.eval_datasets import ( |
| 30 | + AIME2024Dataset, |
| 31 | + AIME2025Dataset, |
| 32 | + GPQADataset, |
| 33 | + MathDataset, |
| 34 | + MMLUDataset, |
| 35 | +) |
28 | 36 | from nemo_rl.data.hf_datasets.deepscaler import DeepScalerDataset |
29 | 37 | from nemo_rl.data.hf_datasets.openmathinstruct2 import OpenMathInstruct2Dataset |
30 | 38 | from nemo_rl.data.interfaces import TaskDataProcessFnCallable, TaskDataSpec |
@@ -78,18 +86,15 @@ def test_math_data_processor(): |
78 | 86 | ], |
79 | 87 | ) |
80 | 88 | @pytest.mark.parametrize( |
81 | | - "dataset_name", |
| 89 | + "dataset_cls", |
82 | 90 | [ |
83 | | - "openmathinstruct2", |
84 | | - "deepscaler", |
| 91 | + OpenMathInstruct2Dataset, |
| 92 | + DeepScalerDataset, |
85 | 93 | ], |
86 | 94 | ) |
87 | | -def test_math_hf_data_processor(tokenizer_name, dataset_name): |
| 95 | +def test_math_hf_data_processor(tokenizer_name, dataset_cls): |
88 | 96 | # Initialize dataset |
89 | | - if dataset_name == "openmathinstruct2": |
90 | | - data = OpenMathInstruct2Dataset() |
91 | | - elif dataset_name == "deepscaler": |
92 | | - data = DeepScalerDataset() |
| 97 | + data = dataset_cls() |
93 | 98 |
|
94 | 99 | # Setup tokenizer |
95 | 100 | tokenizer = get_tokenizer( |
@@ -124,3 +129,70 @@ def test_math_hf_data_processor(tokenizer_name, dataset_name): |
124 | 129 | assert first_item is not None |
125 | 130 | assert "message_log" in first_item |
126 | 131 | assert len(first_item["message_log"]) > 0 |
| 132 | + |
| 133 | + |
@pytest.fixture
def system_prompt_file(request):
    """Yield a path to a throwaway system-prompt file, or ``None``.

    Driven by indirect parametrization: when ``request.param`` is ``None``
    the test runs without any system prompt; any other (truthy) param value
    creates a temporary prompt file on disk. The file is removed once the
    test using the fixture finishes.
    """
    # Support both indirect parametrization and direct use (no param set).
    if getattr(request, "param", None) is None:
        yield None
        return

    # delete=False so the file survives the context manager and can be
    # re-opened by the code under test; it is unlinked explicitly below.
    # .txt suffix: the content is a plain-text prompt template, not JSON.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as file:
        file.write("You are a helpful assistant.\n{}")

    yield file.name
    os.unlink(file.name)
| 140 | + |
| 141 | + |
@pytest.mark.hf_gated
@pytest.mark.parametrize(
    "tokenizer_name",
    [
        "meta-llama/Llama-3.2-1B-Instruct",
        "Qwen/Qwen2.5-1.5B-Instruct",  # no bos token
        "google/gemma-3-1b-it",
        "Qwen/Qwen3-0.6B",  # no bos token
        "deepseek-ai/DeepSeek-V3",
        "moonshotai/Moonlight-16B-A3B-Instruct",
    ],
)
@pytest.mark.parametrize(
    "dataset_cls",
    [
        MMLUDataset,
        GPQADataset,
        MathDataset,
        AIME2024Dataset,
        AIME2025Dataset,
    ],
)
# With indirect=True the parametrize value is only seen as ``request.param``
# inside the fixture, so a plain truthy marker is clearer than passing the
# fixture function object itself; ``None`` exercises the no-system-prompt path.
@pytest.mark.parametrize("system_prompt_file", [True, None], indirect=True)
def test_eval_math_hf_data_processor(tokenizer_name, dataset_cls, system_prompt_file):
    """Smoke-test that each eval dataset can be processed end-to-end.

    Builds the dataset/tokenizer/task-spec pipeline and checks that the
    first processed item yields a non-empty ``message_log``, both with and
    without a system prompt file.
    """
    # Initialize dataset
    data = dataset_cls()

    # Setup tokenizer
    tokenizer = get_tokenizer(
        TokenizerConfig(
            name=tokenizer_name,
            chat_template="default",
        )
    )

    # Configure task specification.
    # NOTE(review): ``abspath`` is presumably this test module's absolute
    # path, defined earlier in the file — confirm it is not a shadowed
    # ``os.path.abspath`` reference.
    math_task_spec = TaskDataSpec(
        task_name="math",
        prompt_file=f"{os.path.dirname(abspath)}/../../../examples/prompts/cot.txt",
        system_prompt_file=system_prompt_file,
    )

    dataset = AllTaskProcessedDataset(
        dataset=data.rekeyed_ds,
        tokenizer=tokenizer,
        default_task_data_spec=math_task_spec,
        task_data_processors=data.processor,
        max_seq_length=128,
    )

    # Test that the first item can be retrieved when the BOS token assertion passes
    first_item = dataset[0]
    assert first_item is not None
    assert "message_log" in first_item
    assert len(first_item["message_log"]) > 0
0 commit comments