From 370bcdfc3345f8ecb1344205b32727727df65366 Mon Sep 17 00:00:00 2001 From: Dipika Date: Mon, 23 Feb 2026 22:52:28 +0000 Subject: [PATCH 1/6] add seeding --- tests/lmeval/test_lmeval.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/tests/lmeval/test_lmeval.py b/tests/lmeval/test_lmeval.py index 987f925bd5..f71e769dd2 100644 --- a/tests/lmeval/test_lmeval.py +++ b/tests/lmeval/test_lmeval.py @@ -100,13 +100,30 @@ def set_up(self, test_data_file: str): self.recipe = eval_config.get("recipe") self.quant_type = eval_config.get("quant_type") self.save_dir = eval_config.get("save_dir") - self.seed = eval_config.get("seed", None) + self.seed = eval_config.get("seed", 42) if self.seed is not None: random.seed(self.seed) numpy.random.seed(self.seed) torch.manual_seed(self.seed) + # Enhanced GPU/CUDA determinism + if torch.cuda.is_available(): + torch.cuda.manual_seed(self.seed) + torch.cuda.manual_seed_all(self.seed) + + # Enable deterministic operations + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + # Set CUDA workspace config for deterministic algorithms + os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8' + + # Enable deterministic algorithms (warn instead of error for unsupported ops) + torch.use_deterministic_algorithms(True, warn_only=True) + + logger.info(f"Seed set to {self.seed} with deterministic mode enabled") + logger.info("========== RUNNING ==============") logger.info(self.scheme) logger.info( From 22b29bc144ddc4612740a2a261c4cda197380b1d Mon Sep 17 00:00:00 2001 From: Dipika Date: Tue, 24 Feb 2026 04:04:43 +0000 Subject: [PATCH 2/6] try again --- tests/lmeval/test_lmeval.py | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/tests/lmeval/test_lmeval.py b/tests/lmeval/test_lmeval.py index f71e769dd2..3a28d15374 100644 --- a/tests/lmeval/test_lmeval.py +++ b/tests/lmeval/test_lmeval.py @@ -102,27 +102,16 @@ def set_up(self, test_data_file: str): self.save_dir = eval_config.get("save_dir") self.seed = eval_config.get("seed", 42) - if self.seed is not None: - random.seed(self.seed) - numpy.random.seed(self.seed) - torch.manual_seed(self.seed) + random.seed(self.seed) + numpy.random.seed(self.seed) + torch.manual_seed(self.seed) - # Enhanced GPU/CUDA determinism - if torch.cuda.is_available(): - torch.cuda.manual_seed(self.seed) - torch.cuda.manual_seed_all(self.seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(self.seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False - # Enable deterministic operations - torch.backends.cudnn.deterministic = True - torch.backends.cudnn.benchmark = False - - # Set CUDA workspace config for deterministic algorithms - os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8' - - # Enable deterministic algorithms (warn instead of error for unsupported ops) - torch.use_deterministic_algorithms(True, warn_only=True) - - logger.info(f"Seed set to {self.seed} with deterministic mode enabled") + logger.info(f"Seed set to {self.seed} with deterministic mode enabled") logger.info("========== RUNNING ==============") logger.info(self.scheme) @@ -204,6 +193,11 @@ def _eval_model(self, model: str) -> dict: limit=self.lmeval.limit, apply_chat_template=self.lmeval.apply_chat_template, batch_size=self.lmeval.batch_size, + # Pass seeds to lm_eval for deterministic evaluation + random_seed=self.seed, + numpy_random_seed=self.seed, + torch_random_seed=self.seed, + fewshot_random_seed=self.seed, ) return results From 43fe84bba124c8e6aef801cb8a49de56fd744b13 Mon Sep 17 00:00:00 2001 From: Dipika Date: Tue, 24 Feb 2026 16:10:02 +0000 Subject: [PATCH 3/6] update --- tests/lmeval/configs/w4a16_awq_sym.yaml | 8 ++++---- tests/lmeval/configs/w4a4_nvfp4.yaml | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/lmeval/configs/w4a16_awq_sym.yaml b/tests/lmeval/configs/w4a16_awq_sym.yaml index 42a1fabcce..54b9af4f17 100644 --- a/tests/lmeval/configs/w4a16_awq_sym.yaml +++ b/tests/lmeval/configs/w4a16_awq_sym.yaml @@ -6,8 +6,8 @@ dataset_id: HuggingFaceH4/ultrachat_200k dataset_split: train_sft lmeval: recovery_threshold: - exact_match,strict-match: 0.91 - exact_match,flexible-extract: 0.91 + exact_match,strict-match: 0.90 + exact_match,flexible-extract: 0.90 metrics: - exact_match,flexible-extract: 0.70 - exact_match,strict-match: 0.70 + exact_match,flexible-extract: 0.6970 + exact_match,strict-match: 0.6990 diff --git a/tests/lmeval/configs/w4a4_nvfp4.yaml b/tests/lmeval/configs/w4a4_nvfp4.yaml index 8862b54571..9fa3442892 100644 --- a/tests/lmeval/configs/w4a4_nvfp4.yaml +++ b/tests/lmeval/configs/w4a4_nvfp4.yaml @@ -8,9 +8,9 @@ lmeval: # NVFP4 (4-bit weights + 4-bit activations) has lower recovery than FP8/INT8 # Observed: strict-match ~92.81%, flexible-extract ~89.59% recovery_threshold: - exact_match,strict-match: 0.92 + exact_match,strict-match: 0.91 exact_match,flexible-extract: 0.89 # Absolute metrics for warnings only metrics: exact_match,flexible-extract: 0.70 - exact_match,strict-match: 0.65 + exact_match,strict-match: 0.6430 From e461b885b93780da14ba763129b0f221c9686252 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 24 Feb 2026 14:42:37 -0500 Subject: [PATCH 4/6] Update flexible-extract metric value in YAML config Signed-off-by: Dipika Sikka --- tests/lmeval/configs/w4a4_nvfp4.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/lmeval/configs/w4a4_nvfp4.yaml b/tests/lmeval/configs/w4a4_nvfp4.yaml index 9fa3442892..953082a263 100644 --- a/tests/lmeval/configs/w4a4_nvfp4.yaml +++ b/tests/lmeval/configs/w4a4_nvfp4.yaml @@ -12,5 +12,5 @@ lmeval: exact_match,flexible-extract: 0.89 # Absolute metrics for warnings only metrics: - exact_match,flexible-extract: 0.70 + exact_match,flexible-extract: 0.6830 exact_match,strict-match: 0.6430 From 8030d1b3ae1631bb395c4ef5e843a4c2bf4d8e0f Mon Sep 17 00:00:00 2001 From: Dipika Date: Tue, 24 Feb 2026 21:34:05 +0000 Subject: [PATCH 5/6] update seeds --- tests/lmeval/configs/w4a16_awq_sym.yaml | 9 +++++---- tests/lmeval/configs/w4a4_nvfp4.yaml | 7 ++++--- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/tests/lmeval/configs/w4a16_awq_sym.yaml b/tests/lmeval/configs/w4a16_awq_sym.yaml index 54b9af4f17..09def3b907 100644 --- a/tests/lmeval/configs/w4a16_awq_sym.yaml +++ b/tests/lmeval/configs/w4a16_awq_sym.yaml @@ -4,10 +4,11 @@ scheme: W4A16_awq_sym recipe: tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_awq_sym.yaml dataset_id: HuggingFaceH4/ultrachat_200k dataset_split: train_sft +seed: 100 lmeval: recovery_threshold: - exact_match,strict-match: 0.90 - exact_match,flexible-extract: 0.90 + exact_match,strict-match: 0.91 + exact_match,flexible-extract: 0.91 metrics: - exact_match,flexible-extract: 0.6970 - exact_match,strict-match: 0.6990 + exact_match,flexible-extract: 0.70 + exact_match,strict-match: 0.70 diff --git a/tests/lmeval/configs/w4a4_nvfp4.yaml b/tests/lmeval/configs/w4a4_nvfp4.yaml index 953082a263..39fc3468df 100644 --- a/tests/lmeval/configs/w4a4_nvfp4.yaml +++ b/tests/lmeval/configs/w4a4_nvfp4.yaml @@ -3,14 +3,15 @@ model: meta-llama/Llama-3.1-8B-Instruct scheme: NVFP4 dataset_id: HuggingFaceH4/ultrachat_200k dataset_split: train_sft +seed: 100 num_calibration_samples: 20 lmeval: # NVFP4 (4-bit weights + 4-bit activations) has lower recovery than FP8/INT8 # Observed: strict-match ~92.81%, flexible-extract ~89.59% recovery_threshold: - exact_match,strict-match: 0.91 + exact_match,strict-match: 0.92 exact_match,flexible-extract: 0.89 # Absolute metrics for warnings only metrics: - exact_match,flexible-extract: 0.6830 - exact_match,strict-match: 0.6430 + exact_match,flexible-extract: 0.70 + exact_match,strict-match: 0.65 From e49d94c8744f56885a93ab1619ff363c01e61b4a Mon Sep 17 00:00:00 2001 From: Dipika Date: Tue, 24 Feb 2026 21:40:03 +0000 Subject: [PATCH 6/6] increase threshold --- tests/lmeval/configs/w4a16_awq_sym.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/lmeval/configs/w4a16_awq_sym.yaml b/tests/lmeval/configs/w4a16_awq_sym.yaml index 09def3b907..666b58b168 100644 --- a/tests/lmeval/configs/w4a16_awq_sym.yaml +++ b/tests/lmeval/configs/w4a16_awq_sym.yaml @@ -7,8 +7,8 @@ dataset_split: train_sft seed: 100 lmeval: recovery_threshold: - exact_match,strict-match: 0.91 - exact_match,flexible-extract: 0.91 + exact_match,strict-match: 0.92 + exact_match,flexible-extract: 0.92 metrics: exact_match,flexible-extract: 0.70 exact_match,strict-match: 0.70