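update: calibrate the Qwen3.5 MoE FP4 example on neuralmagic/calibration with a chat-template preprocessor and a batch-size-1 data collator; store calibrate_all_experts (default True) on the MoE block; read model._no_split_modules directly in get_no_split_params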

dsikka · dsikka · commit fe4e28a54150 · 2026-02-22T00:26:38.000-05:00
diff --git a/examples/quantization_w4a4_fp4/qwen3_5_moe.py b/examples/quantization_w4a4_fp4/qwen3_5_moe.py
@@ -2,6 +2,7 @@
 from datasets import load_dataset
 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import QuantizationModifier
+import torch
 
 MODEL_ID = "/raid/engine/dsikka/models--Qwen--Qwen3.5-397B-A17B/snapshots/7cad2bae11cb49ca79f7d6a0954de2e2756f4e27"
 
@@ -25,48 +26,57 @@
     ],
 )
 
-DATASET_ID = "HuggingFaceH4/ultrachat_200k"
-DATASET_SPLIT = "train_sft"
-
-# Select number of samples
+DATASET_ID = "neuralmagic/calibration"
 NUM_CALIBRATION_SAMPLES = 20
-MAX_SEQUENCE_LENGTH = 2048
+MAX_SEQUENCE_LENGTH = 8192
 
-# Load dataset and preprocess.
-ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
-ds = ds.shuffle(seed=42)
+ds = load_dataset(DATASET_ID, name="LLM", split=f"train[:{NUM_CALIBRATION_SAMPLES}]")
 
 
-def preprocess(example):
-    return {
-        "text": processor.apply_chat_template(
-            example["messages"],
-            tokenize=False,
+def preprocess_function(example):
+    messgages = []
+    for message in example["messages"]:
+        messgages.append(
+            {
+                "role": message["role"],
+                "content": [{"type": "text", "text": message["content"]}],
+            }
         )
-    }
-
 
-ds = ds.map(preprocess)
-
-
-# Tokenize inputs.
-def tokenize(sample):
-    return processor(
-        sample["text"],
+    return processor.apply_chat_template(
+        messgages,
+        return_tensors="pt",
         padding=False,
-        max_length=MAX_SEQUENCE_LENGTH,
         truncation=True,
+        max_length=MAX_SEQUENCE_LENGTH,
+        tokenize=True,
         add_special_tokens=False,
+        return_dict=True,
+        add_generation_prompt=False,
     )
 
 
-ds = ds.map(tokenize, remove_columns=ds.column_names)
+ds = ds.map(preprocess_function, batched=False, remove_columns=ds.column_names)
+
+
+def data_collator(batch):
+    assert len(batch) == 1
+    return {
+        key: (
+            torch.tensor(value)
+            if key != "pixel_values"
+            else torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
+        )
+        for key, value in batch[0].items()
+    }
+
 
 
 # Apply quantization.
 oneshot(model=model, 
     recipe=recipe, 
-    dataset=ds,     
+    dataset=ds,
+    data_collator=data_collator,
     max_seq_length=MAX_SEQUENCE_LENGTH,
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
     moe_calibrate_all_experts=True)
diff --git a/src/llmcompressor/modeling/qwen3_5_vl_moe.py b/src/llmcompressor/modeling/qwen3_5_vl_moe.py
@@ -22,7 +22,7 @@ def __init__(
         self,
         original: "Qwen3_5MoeSparseMoeBlock",
         config: "Qwen3_5MoeConfig",
-        calibrate_all_experts: bool,
+        calibrate_all_experts: bool = True,
     ):
         super().__init__()
         text_config: "Qwen3_5MoeTextConfig" = config.get_text_config()
@@ -33,6 +33,7 @@ def __init__(
         self.shared_expert_gate = original.shared_expert_gate
         self.gate = original.gate
         self.experts = SequentialQwen3VLMoeTextExperts(text_config, original.experts)
+        self.calibrate_all_experts = calibrate_all_experts
     
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         batch_size, sequence_length, hidden_dim = hidden_states.shape
diff --git a/src/llmcompressor/utils/pytorch/module.py b/src/llmcompressor/utils/pytorch/module.py
@@ -337,7 +337,7 @@ def get_no_split_params(model: PreTrainedModel) -> Union[str, List[str]]:
 
     :return: list of class names that shouldn't be split
     """
-    no_split_modules = model._get_no_split_modules("auto")
+    no_split_modules = model._no_split_modules
     if len(no_split_modules) <= 0:
         return ALL_TARGET