Commit 96e517f

Merge remote-tracking branch 'origin' into kylesayrs/autowrap-support-gemma3n

2 parents: 99f9b71 + 0a20392

File tree

5 files changed (+110, -42 lines)

.github/ISSUE_TEMPLATE/bug_report.md

Lines changed: 0 additions & 31 deletions
This file was deleted.

New bug report issue form

Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
name: 🐛 Bug report
description: Raise an issue here if you find a bug.
labels: bug
title: "[Bug]: "

body:
- type: markdown
  attributes:
    value: >
      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/llm-compressor/issues?q=is%3Aissue+sort%3Acreated-desc+).

      #### ⚠️ For any issues related to vLLM which are not related to quantization or compressed models, please create an issue in [vllm-project/vllm](https://github.com/vllm-project/vllm/issues).
- type: textarea
  attributes:
    label: ⚙️ Your current environment
    description: |
      Please run the following and paste the output below.
      ```bash
      wget https://raw.githubusercontent.com/vllm-project/llm-compressor/main/tools/collect_env.py
      # For security purposes, please feel free to check the contents of collect_env.py before running it.
      python collect_env.py
      ```
    value: |
      <details>
      <summary>The output of <code>python collect_env.py</code></summary>

      ```text
      Your output of `python collect_env.py` here
      ```

      </details>
  validations:
    required: true
- type: textarea
  attributes:
    label: 🐛 Describe the bug
    description: |
      Please provide a clear and concise description of what the bug is.
  validations:
    required: true
- type: textarea
  attributes:
    label: 🛠️ Steps to reproduce
    description: |
      If applicable, please describe any steps required to reproduce. If you can share an applicable huggingface model stub, please do so here.
  validations:
    required: false

src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py

Lines changed: 2 additions & 2 deletions
@@ -10,7 +10,7 @@
     ModelCompressor,
     SparsityCompressionConfig,
     delete_offload_parameter,
-    is_module_offloaded,
+    has_offloaded_params,
     register_offload_parameter,
 )
 from loguru import logger
@@ -138,7 +138,7 @@ def untie_word_embeddings(model: PreTrainedModel):
             continue

         # this could be replaced by a `get_offloaded_parameter` util
-        if not is_module_offloaded(module):
+        if not has_offloaded_params(module):
             untied_data = module.weight.data.clone()
         else:
             untied_data = module._hf_hook.weights_map["weight"].clone()
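
For context, `has_offloaded_params` is the newer compressed-tensors name for the check this hunk performs: whether a module's weights live in an accelerate offload map rather than on the module itself. A minimal sketch of the branching pattern above, assuming `has_offloaded_params` is importable from `compressed_tensors` like the neighbouring utilities, and with `clone_module_weight` as a hypothetical helper name:

```python
from compressed_tensors import has_offloaded_params


def clone_module_weight(module):
    # Mirrors untie_word_embeddings: read the weight directly when it is
    # materialized on the module, otherwise pull it from the offload map
    # kept by the module's accelerate hook.
    if not has_offloaded_params(module):
        return module.weight.data.clone()
    return module._hf_hook.weights_map["weight"].clone()
```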

tests/llmcompressor/transformers/gptq/test_oneshot.py

Lines changed: 10 additions & 9 deletions
@@ -20,14 +20,15 @@
                 type: "int"
                 symmetric: true
                 strategy: "channel"
-            targets: ["Linear"]
+            targets: ["re:.*model.layers.2.self_attn.q_proj$"]
 """

 recipe_modifier_full = GPTQModifier(
     ignore=["lm_head"],
     config_groups={
         "group_0": QuantizationScheme(
-            targets=["Linear"], weights=QuantizationArgs(num_bits=4, strategy="channel")
+            targets=["re:.*model.layers.2.self_attn.q_proj$"],
+            weights=QuantizationArgs(num_bits=4, strategy="channel"),
         )
     },
 )
@@ -36,18 +37,18 @@
     ignore=["lm_head"],
     config_groups={
         "group_0": QuantizationScheme(
-            targets=["Linear"],
+            targets=["re:.*model.layers.2.self_attn.q_proj$"],
             weights=QuantizationArgs(num_bits=4, strategy="group", group_size=128),
         )
     },
 )

 recipe_modifier_shorthand_a = GPTQModifier(
-    ignore=["lm_head"], targets="Linear", scheme="W4A16"
+    ignore=["lm_head"], targets="re:.*model.layers.2.self_attn.q_proj$", scheme="W4A16"
 )

 recipe_modifier_shorthand_b = GPTQModifier(
-    ignore=["lm_head"], scheme={"W4A16": ["Linear"]}
+    ignore=["lm_head"], scheme={"W4A16": ["re:.*model.layers.2.self_attn.q_proj$"]}
 )

@@ -65,7 +66,7 @@ def setUp(self):
         import torch

         self.output = "./oneshot_output"
-        self.model = "Xenova/llama2.c-stories110M"
+        self.model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
         self.dataset = "open_platypus"
         self.device = "cuda:0" if torch.cuda.is_available() else "cpu"

@@ -95,17 +96,17 @@ def test_oneshot_application(self):
         assert quantization_config is not None

         # check config is set properly
-        assert quantization_config.ignore == ["lm_head"]
+        assert "lm_head" in quantization_config.ignore
         assert len(quantization_config.config_groups) == 1
         quant_scheme = quantization_config.config_groups["group_0"]
         assert isinstance(quant_scheme, QuantizationScheme)
-        assert quant_scheme.targets == ["Linear"]
+        assert quant_scheme.targets == ["re:.*model.layers.2.self_attn.q_proj$"]
         weight_args = quantization_config.config_groups["group_0"].weights
         assert isinstance(weight_args, QuantizationArgs)
         assert weight_args.num_bits == 4

         # Check a specific layer is quantized
-        targetted_linear_layer = model_loaded.model.layers[0].self_attn.k_proj
+        targetted_linear_layer = model_loaded.model.layers[2].self_attn.q_proj
         assert hasattr(targetted_linear_layer, "quantization_scheme")

         # Check lm-head is not quantized
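
The tests now target a single projection with a `re:`-prefixed pattern instead of every `Linear` module, which is why the assertion moves to `model.layers[2].self_attn.q_proj`. A small sketch of how such a pattern discriminates module names, assuming the common convention that the `re:` prefix is stripped and the remainder is applied as a Python regular expression (the candidate names below are illustrative):

```python
import re

# Target string from the recipe with the "re:" prefix removed.
pattern = ".*model.layers.2.self_attn.q_proj$"

candidates = [
    "model.layers.2.self_attn.q_proj",   # matches: the layer the test asserts is quantized
    "model.layers.0.self_attn.k_proj",   # no match: different layer and projection
    "model.layers.12.self_attn.q_proj",  # no match: "12" breaks the required suffix
]

for name in candidates:
    print(f"{name}: {bool(re.match(pattern, name))}")
```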

tools/collect_env.py

Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
"""
Script used to generate environment information for the purpose of
creating bug reports. See `.github/ISSUE_TEMPLATE/bug_report.md`
"""

import platform
import sys
import importlib.metadata

def get_version(pkg_name):
    try:
        return importlib.metadata.version(pkg_name)
    except importlib.metadata.PackageNotFoundError:
        return "None"

def get_torch_hardware_info():
    try:
        import torch
        cuda_devices = []
        amd_devices = []
        if torch.cuda.is_available():
            for i in range(torch.cuda.device_count()):
                name = torch.cuda.get_device_name(i)
                if "AMD" in name.upper():
                    amd_devices.append(name)
                else:
                    cuda_devices.append(name)
        return cuda_devices, amd_devices
    except ImportError:
        return [], []

def collect_environment_info():
    cuda_devices, amd_devices = get_torch_hardware_info()

    info = {
        "Operating System": platform.platform(),
        "Python Version": sys.version.replace("\n", " "),
        "llm-compressor Version": get_version("llmcompressor"),
        "compressed-tensors Version": get_version("compressed_tensors"),
        "transformers Version": get_version("transformers"),
        "torch Version": get_version("torch"),
        "CUDA Devices": cuda_devices if cuda_devices else "None",
        "AMD Devices": amd_devices if amd_devices else "None",
    }

    print("### Environment Information ###")
    for key, value in info.items():
        print(f"{key}: `{value}`")

if __name__ == "__main__":
    collect_environment_info()
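
Running the script directly, as the issue template instructs, prints the environment report to stdout. For completeness, a minimal sketch of invoking it from Python rather than the shell, assuming `collect_env.py` has already been downloaded into the current working directory (the path is an assumption):

```python
import runpy

# Execute collect_env.py as if it were run with `python collect_env.py`;
# it prints the "### Environment Information ###" report shown above.
runpy.run_path("collect_env.py", run_name="__main__")
```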
