Skip to content

Commit 61ada1f

Browse files
author
Avishek Goswami
committed
Revert unrelated files to upstream main (PR shows only group-size divisibility changes)
Signed-off-by: Avishek Goswami <avishek.goswami@ibm.com>
1 parent 43594f0 commit 61ada1f

File tree

13 files changed

+112
-159
lines changed

13 files changed

+112
-159
lines changed

.github/workflows/build-and-publish-release-images.yaml

Lines changed: 0 additions & 111 deletions
This file was deleted.

.github/workflows/quality-check.yaml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,18 @@ env:
1515

1616
jobs:
1717
quality-check:
18-
runs-on: ubuntu-22.04
18+
runs-on: gcp-k8s-vllm-util
1919
steps:
2020
- uses: actions/setup-python@v5
2121
with:
2222
python-version: '3.10'
2323
- uses: actions/checkout@v4
2424
- name: Install uv
2525
uses: astral-sh/setup-uv@v6
26+
- name: Install make
27+
run: |
28+
sudo apt-get update
29+
sudo apt-get install -y make
2630
- name: "⚙️ Install dependencies"
2731
run: uv pip install .[dev]
2832
- name: "🧹 Running quality checks"

.github/workflows/ready-label-check.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,15 @@ on:
88
- unlabeled
99
- opened
1010
- reopened
11+
- synchronize
1112

1213
concurrency:
1314
group: ${{ github.workflow }}-${{ github.ref }}
1415
cancel-in-progress: true
1516

1617
jobs:
1718
ready-label-check:
18-
runs-on: ubuntu-latest
19+
runs-on: gcp-k8s-vllm-util
1920
steps:
2021
- name: Fail if ready label has not been applied to PR
2122
if: "!contains(github.event.pull_request.labels.*.name, 'ready')"

.github/workflows/stale.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ jobs:
1111
issues: write
1212
pull-requests: write
1313
actions: write
14-
runs-on: ubuntu-latest
14+
runs-on: gcp-k8s-vllm-util
1515
steps:
1616
- uses: actions/stale@997185467fa4f803885201cee163a9f38240193d
1717
with:

.github/workflows/test-check-transformers.yaml

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ concurrency:
2424

2525
jobs:
2626
detect-changes:
27-
runs-on: ubuntu-latest
27+
runs-on: gcp-k8s-vllm-util
2828

2929
outputs:
3030
matched-changes: ${{ steps.changed-files.outputs.all_changed_files }}
@@ -89,38 +89,38 @@ jobs:
8989
- name: "⚙️ Prepare code coverage"
9090
if: inputs.code_coverage
9191
uses: ./.github/actions/prepare-code-coverage
92-
- name: "🔬 Running transformers tests"
92+
- name: "🔬 Running compression tests"
9393
if: (success() || failure()) && steps.install.outcome == 'success'
9494
run: |
95-
pytest -v tests/llmcompressor/transformers/compression
95+
pytest -vra tests/llmcompressor/transformers/compression
9696
- name: Run Data Tests
9797
if: (success() || failure()) && steps.install.outcome == 'success'
9898
run: |
99-
pytest -v tests/llmcompressor/transformers/data
99+
pytest -vra tests/llmcompressor/transformers/data
100100
- name: Running GPTQ Tests
101101
if: (success() || failure()) && steps.install.outcome == 'success'
102102
run: |
103-
pytest -v tests/llmcompressor/transformers/gptq
103+
pytest -vra tests/llmcompressor/transformers/gptq
104104
- name: Running AutoRound Tests
105105
if: (success() || failure()) && steps.install.outcome == 'success'
106106
run: |
107-
pytest -v tests/llmcompressor/transformers/autoround
107+
pytest -vra tests/llmcompressor/transformers/autoround
108108
- name: Running ONESHOT Tests
109109
if: (success() || failure()) && steps.install.outcome == 'success'
110110
run: |
111-
pytest -v tests/llmcompressor/transformers/oneshot
111+
pytest -vra tests/llmcompressor/transformers/oneshot
112112
- name: Running SparseGPT Tests
113113
if: (success() || failure()) && steps.install.outcome == 'success'
114114
run: |
115-
pytest -v tests/llmcompressor/transformers/sparsegpt
115+
pytest -vra tests/llmcompressor/transformers/sparsegpt
116116
- name: Running Tracing Tests
117117
if: (success() || failure()) && steps.install.outcome == 'success'
118118
run: |
119-
pytest -v tests/llmcompressor/transformers/tracing
119+
pytest -vra tests/llmcompressor/transformers/tracing
120120
- name: Running KV Cache Tests
121121
if: (success() || failure()) && steps.install.outcome == 'success'
122122
run: |
123-
pytest -v tests/llmcompressor/transformers/kv_cache
123+
pytest -vra tests/llmcompressor/transformers/kv_cache
124124
- name: "Upload coverage report"
125125
if: (success() || failure()) && inputs.code_coverage
126126
uses: actions/upload-artifact@v4

.github/workflows/test-check.yaml

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ jobs:
120120
uses: ./.github/actions/prepare-code-coverage
121121
- name: "🔬 Running pytorch tests"
122122
run: |
123-
pytest -v tests/llmcompressor/pytorch
123+
pytest -vra tests/llmcompressor/pytorch
124124
- name: "Upload coverage report"
125125
if: (success() || failure()) && inputs.code_coverage
126126
uses: actions/upload-artifact@v4
@@ -138,7 +138,7 @@ jobs:
138138
coverage report --data-file="$COVERAGE_FILE" --skip-empty --format="markdown" > "$GITHUB_STEP_SUMMARY"
139139
140140
combine-coverage:
141-
runs-on: ubuntu-22.04
141+
runs-on: gcp-k8s-vllm-util
142142
needs: [base-tests, pytorch-tests]
143143
if: (success() || failure()) && inputs.code_coverage
144144
steps:
@@ -155,6 +155,10 @@ jobs:
155155
python-version: '3.12'
156156
- name: Install uv
157157
uses: astral-sh/setup-uv@v6
158+
- name: Install make
159+
run: |
160+
sudo apt-get update
161+
sudo apt-get install -y make
158162
- name: "Install dependencies"
159163
run: |
160164
uv pip install -U setuptools

setup.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ def localversion_func(version: ScmVersion) -> str:
121121
else "requests>=2.32.2"
122122
),
123123
("tqdm>=4.66.3,<=4.67.1" if BUILD_TYPE == "release" else "tqdm>=4.66.3"),
124-
("torch>=2.9.0,<=2.9.1" if BUILD_TYPE == "release" else "torch>=2.9.0,<=2.9.1"),
124+
("torch>=2.9.0,<=2.10.0" if BUILD_TYPE == "release" else "torch>=2.9.0"),
125125
(
126126
"transformers>=4.56.1,<=4.57.6"
127127
if BUILD_TYPE == "release"
@@ -163,10 +163,10 @@ def localversion_func(version: ScmVersion) -> str:
163163
"cmarkgfm>=2024.1.14",
164164
"trl>=0.10.1",
165165
"pandas<2.3.0",
166-
"torchvision<=0.24.1",
166+
"torchvision",
167167
"librosa==0.11.0",
168168
"soundfile",
169-
"torchcodec<=0.9.1",
169+
"torchcodec",
170170
# linting, formatting, and type checking
171171
"mypy~=1.10.0",
172172
"ruff~=0.4.8",

src/llmcompressor/modifiers/awq/base.py

Lines changed: 29 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,10 @@ class AWQModifier(Modifier, QuantizationMixin):
7676
balance_layers: ["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"]
7777
- smooth_layer: "re:.*final_layer_norm"
7878
balance_layers: ["re:.*fc1"]
79+
# activation_hook_target specifies which submodule of the parent to hook
80+
# for activation caching.
81+
# This change is only useful for MoE models with parallel transformer blocks,
82+
# and one should use the default value (None) in most cases.
7983
ignore: ["lm_head"]
8084
config_groups:
8185
group_0:
@@ -122,6 +126,11 @@ class AWQModifier(Modifier, QuantizationMixin):
122126
to smoothed) and the second entry is the layer whose output is scaled to
123127
achieve the smoothing.
124128
If regex is used, it matches layers with the largest overlap in module name.
129+
Each mapping may also include an ``activation_hook_target``: a dotted
130+
attribute path relative to the parent module (lowest common ancestor)
131+
specifying which submodule to hook for activation caching. This is useful
132+
for parallel transformer blocks where the default (hooking
133+
``balance_layers[0]``) would capture the wrong activations.
125134
:param ignore: list of layers to ignore during quantization (not smoothed).
126135
It should match the name of layers whose outputs are scaled to achieve
127136
smoothing (the second entry of the mappings list).
@@ -389,6 +398,17 @@ def _set_resolved_mappings(self, model: Module) -> None:
389398
balance_names, model, torch.nn.ModuleList
390399
)
391400

401+
activation_hook_target = None
402+
if mapping.activation_hook_target:
403+
activation_hook_target = getattr_chain(
404+
ancestor, mapping.activation_hook_target
405+
)
406+
if activation_hook_target is None:
407+
raise ValueError(
408+
f"activation_hook_target '{mapping.activation_hook_target}'"
409+
f" not found on parent module '{ancestor_name}'"
410+
)
411+
392412
resolved_mappings.append(
393413
ResolvedMapping(
394414
smooth_name,
@@ -397,6 +417,7 @@ def _set_resolved_mappings(self, model: Module) -> None:
397417
balance_names=balance_names,
398418
parent=ancestor,
399419
parent_name=ancestor_name,
420+
activation_hook_target=activation_hook_target,
400421
)
401422
)
402423
self._resolved_mappings = resolved_mappings
@@ -468,16 +489,14 @@ def cache_smooth_activations_hook(
468489
# input activations to balance layers needed for loss function
469490
# storing inputs to first balance layer is sufficient
470491
# other balance layers get the same input
471-
472-
# The line below is useful for models that use parallel transformer block,
473-
# such as gemma 3, command A. Need a better way to integrate it to the code.
474-
# layer_to_hook = (
475-
# mapping.parent.mlp
476-
# if hasattr(mapping.parent, 'mlp')
477-
# else mapping.balance_layers[0]
478-
# )
492+
#
493+
# For parallel transformer blocks (e.g. Command A, Gemma 3) the first
494+
# balance layer may not receive the right activations. When
495+
# activation_hook_target is set on the mapping, hook that module
496+
# instead of balance_layers[0].
497+
layer_to_hook = mapping.activation_hook_target or mapping.balance_layers[0]
479498
self.register_hook(
480-
mapping.balance_layers[0],
499+
layer_to_hook,
481500
create_cache_smooth_activations_hook_fn(mapping.smooth_name),
482501
"forward",
483502
)
@@ -536,8 +555,6 @@ def _apply_smoothing(self, model: Module) -> None:
536555
orig_layer_weights = {
537556
balance_layer: balance_layer.weight.clone()
538557
for balance_layer in mapping.balance_layers
539-
if hasattr(balance_layer, "quantization_scheme")
540-
and hasattr(balance_layer.quantization_scheme, "weights")
541558
}
542559

543560
best_scales = self._compute_best_scale(
@@ -687,11 +704,9 @@ def _compute_best_scale(
687704
else:
688705
scales = x_mean.pow(ratio).clamp(min=1e-4).view(-1)
689706
scales = scales / (scales.max() * scales.min()).sqrt()
690-
_scalesview = scales.view(1, -1).to(device)
691-
692-
# avoid scaling values that overflow
693707
scales[torch.isinf(scales)] = 1
694708
scales[torch.isnan(scales)] = 1
709+
_scalesview = scales.view(1, -1).to(device)
695710

696711
# Q(W * s)
697712
for balance_layer in balance_layers_to_patch:

0 commit comments

Comments (0)