diff --git a/tests/lmeval/configs/w4a16_awq_sym.yaml b/tests/lmeval/configs/w4a16_awq_sym.yaml
index 42a1fabcce..666b58b168 100644
--- a/tests/lmeval/configs/w4a16_awq_sym.yaml
+++ b/tests/lmeval/configs/w4a16_awq_sym.yaml
@@ -4,10 +4,11 @@ scheme: W4A16_awq_sym
 recipe: tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_awq_sym.yaml
 dataset_id: HuggingFaceH4/ultrachat_200k
 dataset_split: train_sft
+seed: 100
 lmeval:
   recovery_threshold:
-    exact_match,strict-match: 0.91
-    exact_match,flexible-extract: 0.91
+    exact_match,strict-match: 0.92
+    exact_match,flexible-extract: 0.92
   metrics:
     exact_match,flexible-extract: 0.70
     exact_match,strict-match: 0.70
diff --git a/tests/lmeval/configs/w4a4_nvfp4.yaml b/tests/lmeval/configs/w4a4_nvfp4.yaml
index 8862b54571..39fc3468df 100644
--- a/tests/lmeval/configs/w4a4_nvfp4.yaml
+++ b/tests/lmeval/configs/w4a4_nvfp4.yaml
@@ -3,6 +3,7 @@ model: meta-llama/Llama-3.1-8B-Instruct
 scheme: NVFP4
 dataset_id: HuggingFaceH4/ultrachat_200k
 dataset_split: train_sft
+seed: 100
 num_calibration_samples: 20
 lmeval:
   # NVFP4 (4-bit weights + 4-bit activations) has lower recovery than FP8/INT8
diff --git a/tests/lmeval/test_lmeval.py b/tests/lmeval/test_lmeval.py
index 987f925bd5..3a28d15374 100644
--- a/tests/lmeval/test_lmeval.py
+++ b/tests/lmeval/test_lmeval.py
@@ -100,12 +100,18 @@ def set_up(self, test_data_file: str):
         self.recipe = eval_config.get("recipe")
         self.quant_type = eval_config.get("quant_type")
         self.save_dir = eval_config.get("save_dir")
-        self.seed = eval_config.get("seed", None)
+        self.seed = eval_config.get("seed", 42)
 
-        if self.seed is not None:
-            random.seed(self.seed)
-            numpy.random.seed(self.seed)
-            torch.manual_seed(self.seed)
+        random.seed(self.seed)
+        numpy.random.seed(self.seed)
+        torch.manual_seed(self.seed)
+
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed_all(self.seed)
+            torch.backends.cudnn.deterministic = True
+            torch.backends.cudnn.benchmark = False
+
+        logger.info(f"Seed set to {self.seed} with deterministic mode enabled")
 
         logger.info("========== RUNNING ==============")
         logger.info(self.scheme)
@@ -187,6 +193,11 @@ def _eval_model(self, model: str) -> dict:
             limit=self.lmeval.limit,
             apply_chat_template=self.lmeval.apply_chat_template,
             batch_size=self.lmeval.batch_size,
+            # Pass seeds to lm_eval for deterministic evaluation
+            random_seed=self.seed,
+            numpy_random_seed=self.seed,
+            torch_random_seed=self.seed,
+            fewshot_random_seed=self.seed,
         )
         return results