Skip to content

Commit d1b00c7

Browse files
dsikka and kylesayrs authored
[Tests] Fix failing nightly quantization tests (#1744)
## Purpose ##

* Fix failing tests introduced by adding `dispatch_for_generation` to the data free pipeline

## Changes ##

* After oneshot, oneshot should remove any dispatches from the model. However, there is a bug fixed [here](neuralmagic/compressed-tensors#427) where models which fit entirely on one GPU do not have their dispatches removed (since they do not have hooks)
* As a result, we need to move weights to the same device before comparing them for `test_quantization_reload`
* The `test_perplexity` test was implicitly relying on the model being dispatched to GPUs. Now explicitly `dispatch_for_generation`, similar to how we do in our examples

## Testing ##

* Nightly and commit tests passed locally

---------

Signed-off-by: Kyle Sayers <[email protected]>
Co-authored-by: Kyle Sayers <[email protected]>
1 parent 4dec2c3 commit d1b00c7

File tree

2 files changed

+15
-6
lines changed

2 files changed

+15
-6
lines changed

tests/llmcompressor/transformers/compression/test_quantization.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from llmcompressor.args import DatasetArguments
1515
from llmcompressor.pytorch.utils import tensors_to_device
1616
from llmcompressor.transformers.finetune.data import TextGenerationDataset
17+
from llmcompressor.utils.dev import dispatch_for_generation
1718
from tests.testing_utils import parse_params, requires_gpu
1819

1920
CONFIGS_DIRECTORY = "tests/llmcompressor/transformers/compression/configs"
@@ -37,7 +38,7 @@ def setUpClass(cls):
3738
cls.test_dir = tempfile.mkdtemp()
3839

3940
cls.model = AutoModelForCausalLM.from_pretrained(
40-
cls.model_stub, torch_dtype=cls.weight_dtype, device_map="cuda:0"
41+
cls.model_stub, torch_dtype=cls.weight_dtype
4142
)
4243
model = cls._run_oneshot(
4344
cls.model,
@@ -99,18 +100,19 @@ def test_quantization_reload(self):
99100
model_reloaded = AutoModelForCausalLM.from_pretrained(
100101
os.path.join(self.test_dir, self.output),
101102
torch_dtype="auto",
102-
device_map="cuda:0",
103103
)
104104

105105
og_weights, og_inputs = self._get_quant_info(self.model)
106106
reloaded_weights, reloaded_inputs = self._get_quant_info(model_reloaded)
107+
# TODO: can remove `to` calls after
108+
# https://github.com/neuralmagic/compressed-tensors/pull/427
107109

108110
for name, (o_scale, o_zp, o_weight) in og_weights.items():
109111
n_scale, n_zp, n_weight = reloaded_weights[name]
110112
assert o_scale.dtype == n_scale.dtype == self.weight_dtype
111-
assert torch.equal(o_scale, n_scale)
113+
assert torch.equal(o_scale, n_scale.to(o_scale.device))
112114
assert o_zp.dtype == n_zp.dtype
113-
assert torch.equal(o_zp, n_zp)
115+
assert torch.equal(o_zp, n_zp.to(o_zp.device))
114116

115117
# we don't expect an exact match here because o_weight still has the
116118
# original weight and n_weight has been fake_quantized
@@ -119,9 +121,9 @@ def test_quantization_reload(self):
119121
for name, (o_scale, o_zp) in og_inputs.items():
120122
n_scale, n_zp = reloaded_inputs[name]
121123
assert o_scale.dtype == n_scale.dtype == self.weight_dtype
122-
assert torch.equal(o_scale, n_scale)
124+
assert torch.equal(o_scale, n_scale.to(o_scale.device))
123125
assert o_zp.dtype == n_zp.dtype
124-
assert torch.equal(o_zp, n_zp)
126+
assert torch.equal(o_zp, n_zp.to(o_zp.device))
125127

126128
def _get_dataloader(self, dataset_args, tokenizer):
127129
dataset_manager = TextGenerationDataset.load_from_registry(
@@ -150,6 +152,7 @@ def test_perplexity(self):
150152
max_seq_length=self.max_seq_length,
151153
)
152154
dataloader = self._get_dataloader(dataset_args, tokenizer)
155+
dispatch_for_generation(self.model)
153156

154157
total_ppl = 0.0
155158
total_non_nan = 0

tests/testing_utils.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313

1414
from tests.data import CustomTestConfig, TestConfig
1515

16+
TEST_DATA_FILE = os.environ.get("TEST_DATA_FILE", None)
17+
1618

1719
# TODO: probably makes sense to move this type of function to a more central place,
1820
# which can be used by __init__.py as well
@@ -78,6 +80,10 @@ def _parse_configs_dir(current_config_dir):
7880

7981
for file in os.listdir(current_config_dir):
8082
config_path = os.path.join(current_config_dir, file)
83+
if TEST_DATA_FILE is not None:
84+
if not config_path.endswith(TEST_DATA_FILE):
85+
continue
86+
8187
config = _load_yaml(config_path)
8288
if not config:
8389
continue

0 commit comments

Comments (0)