@@ -79,19 +79,23 @@ def zp_valid(zp: Optional[torch.Tensor]):
7979 assert output
8080
8181
82- @pytest.mark.parametrize(
83-     "model_path",
84-     [
85-         "neuralmagic/Llama-3.2-1B-quantized.w8a8"
86-         # TODO static & asymmetric
87-     ])
82+ @pytest.mark.parametrize("model_path", [
83+     "neuralmagic/Llama-3.2-1B-quantized.w8a8",
84+     "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym",
85+     "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
86+     "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym"
87+ ])
8888@pytest.mark.parametrize("max_tokens", [32])
8989@pytest.mark.parametrize("num_logprobs", [10])
9090def test_compressed_tensors_w8a8_logprobs(hf_runner, vllm_runner,
9191                                          example_prompts, model_path,
9292                                          max_tokens, num_logprobs):
9393    dtype = "bfloat16"
9494
95+     # skip language translation prompt for the static per tensor asym model
96+     if model_path == "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym":  # noqa: E501
97+         example_prompts = example_prompts[0:-1]
98+
9599    with hf_runner(model_path, dtype=dtype) as hf_model:
96100        hf_outputs = hf_model.generate_greedy_logprobs_limit(
97101            example_prompts, max_tokens, num_logprobs)
0 commit comments