Skip to content

Commit f0a1824

Browse files
authored
[Tests][LM Eval] Fix test seeding for consistent results (#2395)
SUMMARY: - Ensures consistent test results across runs by seeding Python, NumPy, and PyTorch RNGs (and passing seeds through to lm_eval) Test Run: https://github.com/neuralmagic/llm-compressor-testing/actions/runs/22371360237 --------- Signed-off-by: Dipika Sikka <dipikasikka1@gmail.com>
1 parent b0cc7a0 commit f0a1824

File tree

3 files changed

+20
-7
lines changed

3 files changed

+20
-7
lines changed

tests/lmeval/configs/w4a16_awq_sym.yaml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,11 @@ scheme: W4A16_awq_sym
44
recipe: tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_awq_sym.yaml
55
dataset_id: HuggingFaceH4/ultrachat_200k
66
dataset_split: train_sft
7+
seed: 100
78
lmeval:
89
recovery_threshold:
9-
exact_match,strict-match: 0.91
10-
exact_match,flexible-extract: 0.91
10+
exact_match,strict-match: 0.92
11+
exact_match,flexible-extract: 0.92
1112
metrics:
1213
exact_match,flexible-extract: 0.70
1314
exact_match,strict-match: 0.70

tests/lmeval/configs/w4a4_nvfp4.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ model: meta-llama/Llama-3.1-8B-Instruct
33
scheme: NVFP4
44
dataset_id: HuggingFaceH4/ultrachat_200k
55
dataset_split: train_sft
6+
seed: 100
67
num_calibration_samples: 20
78
lmeval:
89
# NVFP4 (4-bit weights + 4-bit activations) has lower recovery than FP8/INT8

tests/lmeval/test_lmeval.py

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -100,12 +100,18 @@ def set_up(self, test_data_file: str):
100100
self.recipe = eval_config.get("recipe")
101101
self.quant_type = eval_config.get("quant_type")
102102
self.save_dir = eval_config.get("save_dir")
103-
self.seed = eval_config.get("seed", None)
103+
self.seed = eval_config.get("seed", 42)
104104

105-
if self.seed is not None:
106-
random.seed(self.seed)
107-
numpy.random.seed(self.seed)
108-
torch.manual_seed(self.seed)
105+
random.seed(self.seed)
106+
numpy.random.seed(self.seed)
107+
torch.manual_seed(self.seed)
108+
109+
if torch.cuda.is_available():
110+
torch.cuda.manual_seed_all(self.seed)
111+
torch.backends.cudnn.deterministic = True
112+
torch.backends.cudnn.benchmark = False
113+
114+
logger.info(f"Seed set to {self.seed} with deterministic mode enabled")
109115

110116
logger.info("========== RUNNING ==============")
111117
logger.info(self.scheme)
@@ -187,6 +193,11 @@ def _eval_model(self, model: str) -> dict:
187193
limit=self.lmeval.limit,
188194
apply_chat_template=self.lmeval.apply_chat_template,
189195
batch_size=self.lmeval.batch_size,
196+
# Pass seeds to lm_eval for deterministic evaluation
197+
random_seed=self.seed,
198+
numpy_random_seed=self.seed,
199+
torch_random_seed=self.seed,
200+
fewshot_random_seed=self.seed,
190201
)
191202

192203
return results

0 commit comments

Comments (0)