Skip to content

Commit 61ada1f

Browse files
author
Avishek Goswami
committed
Revert unrelated files to upstream main (PR shows only group-size divisibility changes)
Signed-off-by: Avishek Goswami <avishek.goswami@ibm.com>
1 parent 43594f0 commit 61ada1f

File tree

13 files changed

+112
-159
lines changed

13 files changed

+112
-159
lines changed

.github/workflows/build-and-publish-release-images.yaml

Lines changed: 0 additions & 111 deletions
This file was deleted.

.github/workflows/quality-check.yaml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,18 @@ env:
1515

1616
jobs:
1717
quality-check:
18-
runs-on: ubuntu-22.04
18+
runs-on: gcp-k8s-vllm-util
1919
steps:
2020
- uses: actions/setup-python@v5
2121
with:
2222
python-version: '3.10'
2323
- uses: actions/checkout@v4
2424
- name: Install uv
2525
uses: astral-sh/setup-uv@v6
26+
- name: Install make
27+
run: |
28+
sudo apt-get update
29+
sudo apt-get install -y make
2630
- name: "⚙️ Install dependencies"
2731
run: uv pip install .[dev]
2832
- name: "🧹 Running quality checks"

.github/workflows/ready-label-check.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,15 @@ on:
88
- unlabeled
99
- opened
1010
- reopened
11+
- synchronize
1112

1213
concurrency:
1314
group: ${{ github.workflow }}-${{ github.ref }}
1415
cancel-in-progress: true
1516

1617
jobs:
1718
ready-label-check:
18-
runs-on: ubuntu-latest
19+
runs-on: gcp-k8s-vllm-util
1920
steps:
2021
- name: Fail if ready label has not been applied to PR
2122
if: "!contains(github.event.pull_request.labels.*.name, 'ready')"

.github/workflows/stale.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ jobs:
1111
issues: write
1212
pull-requests: write
1313
actions: write
14-
runs-on: ubuntu-latest
14+
runs-on: gcp-k8s-vllm-util
1515
steps:
1616
- uses: actions/stale@997185467fa4f803885201cee163a9f38240193d
1717
with:

.github/workflows/test-check-transformers.yaml

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ concurrency:
2424

2525
jobs:
2626
detect-changes:
27-
runs-on: ubuntu-latest
27+
runs-on: gcp-k8s-vllm-util
2828

2929
outputs:
3030
matched-changes: ${{ steps.changed-files.outputs.all_changed_files }}
@@ -89,38 +89,38 @@ jobs:
8989
- name: "⚙️ Prepare code coverage"
9090
if: inputs.code_coverage
9191
uses: ./.github/actions/prepare-code-coverage
92-
- name: "🔬 Running transformers tests"
92+
- name: "🔬 Running compression tests"
9393
if: (success() || failure()) && steps.install.outcome == 'success'
9494
run: |
95-
pytest -v tests/llmcompressor/transformers/compression
95+
pytest -vra tests/llmcompressor/transformers/compression
9696
- name: Run Data Tests
9797
if: (success() || failure()) && steps.install.outcome == 'success'
9898
run: |
99-
pytest -v tests/llmcompressor/transformers/data
99+
pytest -vra tests/llmcompressor/transformers/data
100100
- name: Running GPTQ Tests
101101
if: (success() || failure()) && steps.install.outcome == 'success'
102102
run: |
103-
pytest -v tests/llmcompressor/transformers/gptq
103+
pytest -vra tests/llmcompressor/transformers/gptq
104104
- name: Running AutoRound Tests
105105
if: (success() || failure()) && steps.install.outcome == 'success'
106106
run: |
107-
pytest -v tests/llmcompressor/transformers/autoround
107+
pytest -vra tests/llmcompressor/transformers/autoround
108108
- name: Running ONESHOT Tests
109109
if: (success() || failure()) && steps.install.outcome == 'success'
110110
run: |
111-
pytest -v tests/llmcompressor/transformers/oneshot
111+
pytest -vra tests/llmcompressor/transformers/oneshot
112112
- name: Running SparseGPT Tests
113113
if: (success() || failure()) && steps.install.outcome == 'success'
114114
run: |
115-
pytest -v tests/llmcompressor/transformers/sparsegpt
115+
pytest -vra tests/llmcompressor/transformers/sparsegpt
116116
- name: Running Tracing Tests
117117
if: (success() || failure()) && steps.install.outcome == 'success'
118118
run: |
119-
pytest -v tests/llmcompressor/transformers/tracing
119+
pytest -vra tests/llmcompressor/transformers/tracing
120120
- name: Running KV Cache Tests
121121
if: (success() || failure()) && steps.install.outcome == 'success'
122122
run: |
123-
pytest -v tests/llmcompressor/transformers/kv_cache
123+
pytest -vra tests/llmcompressor/transformers/kv_cache
124124
- name: "Upload coverage report"
125125
if: (success() || failure()) && inputs.code_coverage
126126
uses: actions/upload-artifact@v4

.github/workflows/test-check.yaml

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ jobs:
120120
uses: ./.github/actions/prepare-code-coverage
121121
- name: "🔬 Running pytorch tests"
122122
run: |
123-
pytest -v tests/llmcompressor/pytorch
123+
pytest -vra tests/llmcompressor/pytorch
124124
- name: "Upload coverage report"
125125
if: (success() || failure()) && inputs.code_coverage
126126
uses: actions/upload-artifact@v4
@@ -138,7 +138,7 @@ jobs:
138138
coverage report --data-file="$COVERAGE_FILE" --skip-empty --format="markdown" > "$GITHUB_STEP_SUMMARY"
139139
140140
combine-coverage:
141-
runs-on: ubuntu-22.04
141+
runs-on: gcp-k8s-vllm-util
142142
needs: [base-tests, pytorch-tests]
143143
if: (success() || failure()) && inputs.code_coverage
144144
steps:
@@ -155,6 +155,10 @@ jobs:
155155
python-version: '3.12'
156156
- name: Install uv
157157
uses: astral-sh/setup-uv@v6
158+
- name: Install make
159+
run: |
160+
sudo apt-get update
161+
sudo apt-get install -y make
158162
- name: "Install dependencies"
159163
run: |
160164
uv pip install -U setuptools

setup.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ def localversion_func(version: ScmVersion) -> str:
121121
else "requests>=2.32.2"
122122
),
123123
("tqdm>=4.66.3,<=4.67.1" if BUILD_TYPE == "release" else "tqdm>=4.66.3"),
124-
("torch>=2.9.0,<=2.9.1" if BUILD_TYPE == "release" else "torch>=2.9.0,<=2.9.1"),
124+
("torch>=2.9.0,<=2.10.0" if BUILD_TYPE == "release" else "torch>=2.9.0"),
125125
(
126126
"transformers>=4.56.1,<=4.57.6"
127127
if BUILD_TYPE == "release"
@@ -163,10 +163,10 @@ def localversion_func(version: ScmVersion) -> str:
163163
"cmarkgfm>=2024.1.14",
164164
"trl>=0.10.1",
165165
"pandas<2.3.0",
166-
"torchvision<=0.24.1",
166+
"torchvision",
167167
"librosa==0.11.0",
168168
"soundfile",
169-
"torchcodec<=0.9.1",
169+
"torchcodec",
170170
# linting, formatting, and type checking
171171
"mypy~=1.10.0",
172172
"ruff~=0.4.8",

src/llmcompressor/modifiers/awq/base.py

Lines changed: 29 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,10 @@ class AWQModifier(Modifier, QuantizationMixin):
7676
balance_layers: ["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"]
7777
- smooth_layer: "re:.*final_layer_norm"
7878
balance_layers: ["re:.*fc1"]
79+
# activation_hook_target specifies which submodule of the parent to hook
80+
# for activation caching.
81+
# This change is only useful for MoE models with parallel transformer blocks,
82+
# and one should use the default value (None) in most cases.
7983
ignore: ["lm_head"]
8084
config_groups:
8185
group_0:
@@ -122,6 +126,11 @@ class AWQModifier(Modifier, QuantizationMixin):
122126
to smoothed) and the second entry is the layer whose output is scaled to
123127
achieve the smoothing.
124128
If regex is used, it matches layers with the largest overlap in module name.
129+
Each mapping may also include an ``activation_hook_target``: a dotted
130+
attribute path relative to the parent module (lowest common ancestor)
131+
specifying which submodule to hook for activation caching. This is useful
132+
for parallel transformer blocks where the default (hooking
133+
``balance_layers[0]``) would capture the wrong activations.
125134
:param ignore: list of layers to ignore during quantization (not smoothed).
126135
It should match the name of layers whose outputs are scaled to achieve
127136
smoothing (the second entry of the mappings list).
@@ -389,6 +398,17 @@ def _set_resolved_mappings(self, model: Module) -> None:
389398
balance_names, model, torch.nn.ModuleList
390399
)
391400

401+
activation_hook_target = None
402+
if mapping.activation_hook_target:
403+
activation_hook_target = getattr_chain(
404+
ancestor, mapping.activation_hook_target
405+
)
406+
if activation_hook_target is None:
407+
raise ValueError(
408+
f"activation_hook_target '{mapping.activation_hook_target}'"
409+
f" not found on parent module '{ancestor_name}'"
410+
)
411+
392412
resolved_mappings.append(
393413
ResolvedMapping(
394414
smooth_name,
@@ -397,6 +417,7 @@ def _set_resolved_mappings(self, model: Module) -> None:
397417
balance_names=balance_names,
398418
parent=ancestor,
399419
parent_name=ancestor_name,
420+
activation_hook_target=activation_hook_target,
400421
)
401422
)
402423
self._resolved_mappings = resolved_mappings
@@ -468,16 +489,14 @@ def cache_smooth_activations_hook(
468489
# input activations to balance layers needed for loss function
469490
# storing inputs to first balance layer is sufficient
470491
# other balance layers get the same input
471-
472-
# The line below is useful for models that use parallel transformer block,
473-
# such as gemma 3, command A. Need a better way to integrate it to the code.
474-
# layer_to_hook = (
475-
# mapping.parent.mlp
476-
# if hasattr(mapping.parent, 'mlp')
477-
# else mapping.balance_layers[0]
478-
# )
492+
#
493+
# For parallel transformer blocks (e.g. Command A, Gemma 3) the first
494+
# balance layer may not receive the right activations. When
495+
# activation_hook_target is set on the mapping, hook that module
496+
# instead of balance_layers[0].
497+
layer_to_hook = mapping.activation_hook_target or mapping.balance_layers[0]
479498
self.register_hook(
480-
mapping.balance_layers[0],
499+
layer_to_hook,
481500
create_cache_smooth_activations_hook_fn(mapping.smooth_name),
482501
"forward",
483502
)
@@ -536,8 +555,6 @@ def _apply_smoothing(self, model: Module) -> None:
536555
orig_layer_weights = {
537556
balance_layer: balance_layer.weight.clone()
538557
for balance_layer in mapping.balance_layers
539-
if hasattr(balance_layer, "quantization_scheme")
540-
and hasattr(balance_layer.quantization_scheme, "weights")
541558
}
542559

543560
best_scales = self._compute_best_scale(
@@ -687,11 +704,9 @@ def _compute_best_scale(
687704
else:
688705
scales = x_mean.pow(ratio).clamp(min=1e-4).view(-1)
689706
scales = scales / (scales.max() * scales.min()).sqrt()
690-
_scalesview = scales.view(1, -1).to(device)
691-
692-
# avoid scaling values that overflow
693707
scales[torch.isinf(scales)] = 1
694708
scales[torch.isnan(scales)] = 1
709+
_scalesview = scales.view(1, -1).to(device)
695710

696711
# Q(W * s)
697712
for balance_layer in balance_layers_to_patch:

0 commit comments

Comments (0)