vllm-project · dsikka · Sep 25, 2025 · Sep 24, 2025 · Sep 24, 2025 · Sep 25, 2025
diff --git a/examples/quantization_w4a4_fp4/qwen_30b_a3b.py b/examples/quantization_w4a4_fp4/qwen_30b_a3b.py
@@ -16,7 +16,7 @@
 DATASET_SPLIT = "train_sft"
 
 # Select number of samples
-NUM_CALIBRATION_SAMPLES = 200
+NUM_CALIBRATION_SAMPLES = 20
 MAX_SEQUENCE_LENGTH = 2048
 
 # Load dataset and preprocess.
@@ -84,7 +84,6 @@ def tokenize(sample):
 print(tokenizer.decode(output[0]))
 print("==========================================\n\n")
 
-
 # Save to disk in compressed-tensors format.
 SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-NVFP4"
 model.save_pretrained(SAVE_DIR, save_compressed=True)

@@ -62,7 +62,7 @@ def update_qwen3_moe(model, stack, calibrate_all_experts):
 def moe_calibration_context(
     model: PreTrainedModel,
     stack,
-    calibrate_all_experts: bool = False,
+    calibrate_all_experts: bool = True,
 ):
     # Temporarily updates the MoE modules within the context
     # Once the context exits, parameter updates persist