Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/linkcheck.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ on:
branches: [ main, 'release/*' ]
pull_request:
branches: [ main, 'release/*' ]
merge_group:

# Allows you to run this workflow manually from the Actions tab
workflow_dispatch:
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/quality-check.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ on:
branches: [ main , 'release/*' ]
pull_request:
branches: [ main, 'release/*' ]
merge_group:

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
Expand Down
11 changes: 9 additions & 2 deletions .github/workflows/ready-label-check.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ on:
- opened
- reopened
- synchronize
merge_group:

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
Expand All @@ -18,14 +19,20 @@ jobs:
ready-label-check:
runs-on: gcp-k8s-vllm-util
steps:
- name: Auto-succeed for merge queue
if: github.event_name == 'merge_group'
run: |
echo "::info::Merge queue event - automatically passing ready label check"
exit 0

- name: Fail if ready label has not been applied to PR
if: "!contains(github.event.pull_request.labels.*.name, 'ready')"
if: "github.event_name != 'merge_group' && !contains(github.event.pull_request.labels.*.name, 'ready')"
run: |
echo "::error::The PR is not labeled as 'ready'"
exit 1

- name: Succeed if ready label has been applied to PR
if: contains(github.event.pull_request.labels.*.name, 'ready')
if: "github.event_name != 'merge_group' && contains(github.event.pull_request.labels.*.name, 'ready')"
run: |
echo "::info::The PR is labeled as 'ready'"
exit 0
1 change: 1 addition & 0 deletions .github/workflows/test-check-transformers.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ on:
types: [ labeled, synchronize ]
push:
branches: [ main, 'release/*' ]
merge_group:
workflow_dispatch:
inputs:
code_coverage:
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/test-check.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ on:
branches: [ main, 'release/*' ]
push:
branches: [ main, 'release/*' ]
merge_group:
workflow_dispatch:
inputs:
code_coverage:
Expand Down
87 changes: 87 additions & 0 deletions examples/quantization_w4a4_fp4/qwen3_5_moe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
from transformers import AutoProcessor, Qwen3_5MoeForConditionalGeneration
from datasets import load_dataset
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
import torch

# Path to a local snapshot of Qwen3.5-397B-A17B.
# NOTE(review): hard-coded absolute path to a developer machine — replace with
# the public Hugging Face model id before publishing this example.
MODEL_ID = "/raid/engine/dsikka/models--Qwen--Qwen3.5-397B-A17B/snapshots/7cad2bae11cb49ca79f7d6a0954de2e2756f4e27"

# Load model.
model = Qwen3_5MoeForConditionalGeneration.from_pretrained(MODEL_ID, dtype="auto")
processor = AutoProcessor.from_pretrained(MODEL_ID)


# Configure NVFP4 quantization for all Linear layers, skipping modules that
# should stay in higher precision: lm_head, the vision tower, router gates,
# embeddings, shared-expert gating, and linear attention.
recipe = QuantizationModifier(
    targets="Linear",
    scheme="NVFP4",
    ignore=[
        "re:.*lm_head",
        "re:visual.*",
        "re:model.visual.*",
        "re:.*mlp.gate$",
        "re:.*embed_tokens$",
        "re:.*shared_expert_gate$",
        "re:.*mlp\\.shared_expert$",
        "re:.*linear_attn.*",
    ],
)

# Calibration dataset configuration.
DATASET_ID = "neuralmagic/calibration"
NUM_CALIBRATION_SAMPLES = 20
MAX_SEQUENCE_LENGTH = 8192

ds = load_dataset(DATASET_ID, name="LLM", split=f"train[:{NUM_CALIBRATION_SAMPLES}]")


def preprocess_function(example):
    """Convert one raw calibration example into tokenized chat-template inputs.

    Each message's plain-text content is wrapped in the structured
    ``[{"type": "text", "text": ...}]`` form expected by the processor's chat
    template, then the whole conversation is tokenized in one call.

    :param example: dataset row with a ``"messages"`` list of
        ``{"role": ..., "content": ...}`` dicts
    :return: dict of tokenized tensors from ``apply_chat_template``
    """
    # Fix: variable was previously misspelled as "messgages".
    messages = [
        {
            "role": message["role"],
            "content": [{"type": "text", "text": message["content"]}],
        }
        for message in example["messages"]
    ]

    return processor.apply_chat_template(
        messages,
        return_tensors="pt",
        padding=False,
        truncation=True,
        max_length=MAX_SEQUENCE_LENGTH,
        tokenize=True,
        add_special_tokens=False,
        return_dict=True,
        add_generation_prompt=False,
    )


ds = ds.map(preprocess_function, batched=False, remove_columns=ds.column_names)


def data_collator(batch):
    """Collate a single preprocessed sample into model-ready tensors.

    Expects exactly one example per batch (oneshot calibration runs with batch
    size 1). Every field is converted to a tensor; ``pixel_values`` is
    additionally cast to bfloat16 and has its leading batch dim squeezed away.
    """
    assert len(batch) == 1
    sample = batch[0]
    collated = {}
    for key, value in sample.items():
        if key == "pixel_values":
            collated[key] = torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
        else:
            collated[key] = torch.tensor(value)
    return collated



# Apply quantization with calibration data; all experts are activated during
# calibration so every expert's observers see statistics.
oneshot(
    model=model,
    recipe=recipe,
    dataset=ds,
    data_collator=data_collator,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    moe_calibrate_all_experts=True,
)

# Save to disk in compressed-tensors format.
# A single literal is clearer than concatenating constant fragments.
SAVE_DIR = "/raid/engine/dsikka/Qwen3.5-397B-A17B-NVFP4"
model.save_pretrained(SAVE_DIR)
processor.save_pretrained(SAVE_DIR)
38 changes: 38 additions & 0 deletions examples/quantization_w8a8_fp8/qwen3_5_moe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from transformers import AutoProcessor, Qwen3_5MoeForConditionalGeneration

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

# Path to a local snapshot of Qwen3.5-397B-A17B.
# NOTE(review): hard-coded absolute path to a developer machine — replace with
# the public Hugging Face model id before publishing this example.
MODEL_ID = "/raid/engine/dsikka/models--Qwen--Qwen3.5-397B-A17B/snapshots/7cad2bae11cb49ca79f7d6a0954de2e2756f4e27"

# Load model.
model = Qwen3_5MoeForConditionalGeneration.from_pretrained(MODEL_ID, dtype="auto")
processor = AutoProcessor.from_pretrained(MODEL_ID)

# Configure the quantization algorithm and scheme.
# In this case, we:
# * quantize the weights to fp8 with channel-wise quantization
# * quantize the activations to fp8 with dynamic token activations
# NOTE: only data-free quantization is supported currently (comment inherited
# from the Qwen3-VL-MoE example — confirm it also holds for Qwen3.5-MoE).
recipe = QuantizationModifier(
    targets="Linear",
    scheme="FP8_DYNAMIC",
    ignore=[
        "re:.*lm_head",
        "re:visual.*",
        "re:model.visual.*",
        "re:.*mlp.gate$",
        "re:.*embed_tokens$",
        "re:.*shared_expert_gate$",
        "re:.*mlp\\.shared_expert$",
        "re:.*linear_attn.*",
    ],
)

# Apply quantization (no calibration dataset needed for dynamic FP8).
oneshot(model=model, recipe=recipe)

# Save to disk in compressed-tensors format.
# A single literal is clearer than concatenating constant fragments.
SAVE_DIR = "/raid/engine/dsikka/Qwen3.5-397B-A17B-FP8-Dynamic-NoLinearAttn"
model.save_pretrained(SAVE_DIR)
processor.save_pretrained(SAVE_DIR)
1 change: 1 addition & 0 deletions src/llmcompressor/modeling/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from .qwen3_moe import CalibrationQwen3MoeSparseMoeBlock # noqa: F401
from .qwen3_vl_moe import CalibrateQwen3VLMoeTextSparseMoeBlock # noqa: F401
from .qwen3_next_moe import CalibrationQwen3NextSparseMoeBlock # noqa: F401
from .qwen3_5_vl_moe import CalibrateQwen3_5MoeTextSparseMoeBlock # noqa: F401
# TODO: add granite4, Qwen3Next

from .fuse import *
124 changes: 124 additions & 0 deletions src/llmcompressor/modeling/qwen3_5_vl_moe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
import torch
from transformers import Qwen3_5MoeConfig, Qwen3_5MoeTextConfig
from transformers.models.qwen3_5_moe.modeling_qwen3_5_moe import (
Qwen3_5MoeSparseMoeBlock,
)

from llmcompressor.modeling.moe_context import MoECalibrationModule
from llmcompressor.utils.dev import skip_weights_initialize
import torch.nn.functional as F


@MoECalibrationModule.register("Qwen3_5MoeSparseMoeBlock")
class CalibrateQwen3_5MoeTextSparseMoeBlock(MoECalibrationModule):
    """
    Calibration version of Qwen3_5MoeSparseMoeBlock that sends all tokens to all
    experts.

    When ``calibrate_all_experts`` is True, every expert runs on every token so
    quantization observers collect statistics for all experts, but only the
    router-selected tokens contribute to the block output — the output matches
    the original sparse MoE block.
    """

    # The replacement module is kept after calibration rather than swapped back.
    is_permanent = True

    def __init__(
        self,
        original: "Qwen3_5MoeSparseMoeBlock",
        config: "Qwen3_5MoeConfig",
        calibrate_all_experts: bool = True,
    ):
        """
        :param original: the sparse MoE block being wrapped; its gate and
            shared-expert submodules are reused directly
        :param config: top-level model config; the text sub-config supplies
            the expert count
        :param calibrate_all_experts: if True, run every expert on every token
            during the forward pass
        """
        super().__init__()
        text_config: "Qwen3_5MoeTextConfig" = config.get_text_config()

        self.num_experts = text_config.num_experts

        self.shared_expert = original.shared_expert
        self.shared_expert_gate = original.shared_expert_gate
        self.gate = original.gate
        # Unpack the fused expert weights into per-expert MLP modules so each
        # expert has quantizable Linear layers.
        self.experts = SequentialQwen3VLMoeTextExperts(text_config, original.experts)
        self.calibrate_all_experts = calibrate_all_experts

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Route tokens through experts, optionally running all experts.

        :param hidden_states: activations of shape (batch, seq_len, hidden_dim)
        :return: tensor of the same shape
        """
        batch_size, sequence_length, hidden_dim = hidden_states.shape
        hidden_states_reshaped = hidden_states.view(-1, hidden_dim)

        # router: returns (router_logits, router_scores, router_indices)
        # NOTE(review): tuple order assumed from usage below — confirm against
        # the transformers Qwen3_5Moe gate implementation.
        _, routing_weights, selected_experts = self.gate(hidden_states_reshaped)

        # expert mask: (num_experts, top_k, num_tokens)
        expert_mask = F.one_hot(selected_experts, num_classes=self.num_experts).permute(
            2, 1, 0
        )

        # Accumulator for the weighted expert outputs, flat over tokens.
        final_hidden_states = torch.zeros(
            (batch_size * sequence_length, hidden_dim),
            dtype=hidden_states.dtype,
            device=hidden_states.device,
        )

        for expert_idx, expert_layer in enumerate(self.experts):
            # idx: top-k slot this expert occupies; token_idx: tokens routed here.
            idx, token_idx = torch.where(expert_mask[expert_idx])

            if self.calibrate_all_experts:
                # Run the expert on ALL tokens (for observer statistics), then
                # keep only the routed tokens' outputs for the result.
                expert_out = expert_layer(hidden_states_reshaped)[token_idx]
            else:
                expert_out = expert_layer(hidden_states_reshaped[token_idx])

            if len(token_idx) > 0:
                # Scale each routed token's output by its routing weight.
                current_hidden_states = (
                    expert_out * routing_weights[token_idx, idx, None]
                )
                final_hidden_states.index_add_(
                    0,
                    token_idx,
                    current_hidden_states.to(hidden_states.dtype),
                )

        # shared expert: always applied, scaled by a learned sigmoid gate.
        shared_expert_output = self.shared_expert(hidden_states_reshaped)
        shared_expert_output = (
            F.sigmoid(self.shared_expert_gate(hidden_states_reshaped))
            * shared_expert_output
        )
        final_hidden_states = final_hidden_states + shared_expert_output

        final_hidden_states = final_hidden_states.reshape(
            batch_size, sequence_length, hidden_dim
        )
        return final_hidden_states

    def restore(self, original: torch.nn.Module) -> torch.nn.Module:
        # Permanent replacement: returning the original unchanged satisfies the
        # MoECalibrationModule interface without undoing the swap.
        return original


class SequentialQwen3VLMoeTextExperts(torch.nn.ModuleList):
    """Unpacked, per-expert view of a fused Qwen3.5-MoE experts module.

    The fused module stores all experts' weights as stacked 3-D tensors
    (``gate_up_proj`` / ``down_proj``); this container materializes one
    ``Qwen3_5MoeMLP`` per expert and copies the matching slices into its
    ``gate_proj`` / ``up_proj`` / ``down_proj`` Linear weights so each expert
    can be calibrated and quantized independently.

    NOTE(review): the name says "Qwen3VL" but this file targets Qwen3.5-MoE —
    likely a copy-paste artifact. Kept unchanged here so the reference in
    ``CalibrateQwen3_5MoeTextSparseMoeBlock`` still resolves; rename both
    together in a follow-up.
    """

    def __init__(self, config, original):
        # Local imports avoid a hard import-time dependency when unused.
        from transformers.models.qwen3_5_moe.modeling_qwen3_5_moe import (
            Qwen3_5MoeMLP,
        )
        from compressed_tensors.offload import disable_onloading

        # Leading dim of the stacked weights is the expert count.
        self.num_experts = original.gate_up_proj.shape[0]
        with skip_weights_initialize():
            super().__init__(
                [
                    # NOTE(review): constructed with the *shared*-expert
                    # intermediate size; the weight assignments below replace
                    # the data tensors outright, but confirm this should not
                    # be the regular expert intermediate size.
                    Qwen3_5MoeMLP(
                        config, intermediate_size=config.shared_expert_intermediate_size
                    )
                    for _ in range(self.num_experts)
                ]
            )

        intermediate_size = original.down_proj.shape[-1]

        # Read fused weights without triggering any offload/onload hooks.
        with disable_onloading():
            gate_up_data = original.gate_up_proj.data  # [num_experts, 2*inter, hidden]
            down_data = original.down_proj.data  # [num_experts, hidden, inter]

            for i in range(self.num_experts):
                gate_up = gate_up_data[i]
                down = down_data[i]

                # First `intermediate_size` fused rows are gate_proj, the rest
                # are up_proj.
                gate_proj = gate_up[:intermediate_size, :]
                up_proj = gate_up[intermediate_size:, :]

                self[i].gate_proj.weight.data = gate_proj.clone().contiguous()
                self[i].up_proj.weight.data = up_proj.clone().contiguous()
                self[i].down_proj.weight.data = down.clone().contiguous()
22 changes: 21 additions & 1 deletion src/llmcompressor/utils/dev.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
from loguru import logger
from safetensors.torch import save_file
from transformers import AutoModelForCausalLM, PreTrainedModel
from transformers.modeling_utils import TORCH_INIT_FUNCTIONS

# from transformers.modeling_utils import TORCH_INIT_FUNCTIONS
from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, WEIGHTS_INDEX_NAME

__all__ = [
Expand All @@ -22,6 +23,25 @@
"dispatch_for_generation",
]

# nn is only needed for the init-function table below.
from torch import nn

# Vendored replacement for transformers.modeling_utils.TORCH_INIT_FUNCTIONS,
# which is no longer importable from recent transformers releases (see the
# commented-out import above). Maps torch.nn.init function names to their
# callables so skip_weights_initialize can patch them out.
# NOTE(review): the non-underscore entries (e.g. nn.init.uniform) are
# long-deprecated aliases — confirm they still exist on every torch version
# this project supports, or the dict construction will raise at import time.
TORCH_INIT_FUNCTIONS = {
    "uniform_": nn.init.uniform_,
    "normal_": nn.init.normal_,
    "trunc_normal_": nn.init.trunc_normal_,
    "constant_": nn.init.constant_,
    "xavier_uniform_": nn.init.xavier_uniform_,
    "xavier_normal_": nn.init.xavier_normal_,
    "kaiming_uniform_": nn.init.kaiming_uniform_,
    "kaiming_normal_": nn.init.kaiming_normal_,
    "uniform": nn.init.uniform,
    "normal": nn.init.normal,
    "xavier_uniform": nn.init.xavier_uniform,
    "xavier_normal": nn.init.xavier_normal,
    "kaiming_uniform": nn.init.kaiming_uniform,
    "kaiming_normal": nn.init.kaiming_normal,
}


@contextlib.contextmanager
def skip_weights_download(model_class: Type[PreTrainedModel] = AutoModelForCausalLM):
Expand Down
2 changes: 1 addition & 1 deletion src/llmcompressor/utils/pytorch/module.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ def get_no_split_params(model: PreTrainedModel) -> Union[str, List[str]]:

:return: list of class names that shouldn't be split
"""
no_split_modules = model._get_no_split_modules("auto")
no_split_modules = model._no_split_modules
if len(no_split_modules) <= 0:
return ALL_TARGET

Expand Down
Loading