
Commit 1b867f0

xin3he and HolyFalafel authored
cherry-pick two internal PRs for vllm-gaudi (#2392)
* [SW-228723] Added flag if op supports dynamic quant to ModuleInfo (#314)
* [GAUDISW-244137] Set FusedMoE and VllmMixtureOfExperts to support dynamic quantization (#328)
  * Set FusedMoE to support dynamic quantization
  * added vllm moe

---------

Co-authored-by: Danny Semiat <[email protected]>
1 parent 4063437 commit 1b867f0

File tree: 3 files changed, +33 −25 lines changed
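In outline, this commit moves the "does this op support dynamic quantization?" decision off a hardcoded list in quant_config.py and onto the ModuleInfo registry entries themselves. Below is a minimal, self-contained sketch of that pattern; the class and names here are simplified stand-ins, not the library's real implementation beyond what the diffs that follow show.

# Simplified stand-in of the pattern introduced in the diffs below: each
# registered op's ModuleInfo carries a supports_dynamic_quantization flag,
# and the check becomes a registry lookup instead of list membership.
class ModuleInfo:
    def __init__(self, type, patched_module, should_measure_and_quant=True, *, supports_dynamic_quantization=False):
        self.type = type
        self.patched_module = patched_module
        self.should_measure_and_quant = should_measure_and_quant
        self.supports_dynamic_quantization = supports_dynamic_quantization

_mod_default_dict = {
    "Linear": ModuleInfo("linear", object, supports_dynamic_quantization=True),
    "Conv2d": ModuleInfo("linear", object),  # patched, but does not opt in
}

def is_supported_dynamic_op(op_str):
    info = _mod_default_dict.get(op_str)
    return getattr(info, "supports_dynamic_quantization", False) if info is not None else False

assert is_supported_dynamic_op("Linear")
assert not is_supported_dynamic_op("Conv2d")
assert not is_supported_dynamic_op("SomethingUnpatched")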

neural_compressor/torch/algorithms/fp8_quant/_core/patching_common.py

Lines changed: 20 additions & 20 deletions
@@ -67,43 +67,43 @@ def create_mod_info_recursion(parent):
 
 _mod_default_dict = {
     "Matmul": ModuleInfo("matmul", PatchedMatmul),
-    "Linear": ModuleInfo("linear", PatchedLinear),
-    "ParallelLMHead": ModuleInfo("linear", PatchedParallelLMHead),
-    "RowParallelLinear": ModuleInfo("row_parallel_linear", PatchedRowParallelLinear),
-    "ColumnParallelLinear": ModuleInfo("linear", PatchedColumnParallelLinear),
-    "MergedColumnParallelLinear": ModuleInfo("linear", PatchedColumnParallelLinear),
-    "QKVParallelLinear": ModuleInfo("linear", PatchedColumnParallelLinear),
-    "FalconLinear": ModuleInfo("linear", PatchedLinear),
+    "Linear": ModuleInfo("linear", PatchedLinear, supports_dynamic_quantization=True),
+    "ParallelLMHead": ModuleInfo("linear", PatchedParallelLMHead, supports_dynamic_quantization=True),
+    "RowParallelLinear": ModuleInfo("row_parallel_linear", PatchedRowParallelLinear, supports_dynamic_quantization=True),
+    "ColumnParallelLinear": ModuleInfo("linear", PatchedColumnParallelLinear, supports_dynamic_quantization=True),
+    "MergedColumnParallelLinear": ModuleInfo("linear", PatchedColumnParallelLinear, supports_dynamic_quantization=True),
+    "QKVParallelLinear": ModuleInfo("linear", PatchedColumnParallelLinear, supports_dynamic_quantization=True),
+    "FalconLinear": ModuleInfo("linear", PatchedLinear, supports_dynamic_quantization=True),
     "KVCache": ModuleInfo("kv_cache", PatchedKVCache),
     "VLLMKVCache": ModuleInfo("kv_cache", PatchedVLLMKVCache),
     "Conv2d": ModuleInfo("linear", PatchedConv2d),
-    "LoRACompatibleLinear": ModuleInfo("linear", PatchedLoRACompatibleLinear),
+    "LoRACompatibleLinear": ModuleInfo("linear", PatchedLoRACompatibleLinear, supports_dynamic_quantization=True),
     "LoRACompatibleConv": ModuleInfo("linear", PatchedLoRACompatibleConv),
     "Softmax": ModuleInfo("softmax", PatchedSoftmax),
     "BlockSoftmaxConstMax": ModuleInfo("softmax", PatchedBlockSoftmaxConstMax),
     "ModuleFusedSDPA": ModuleInfo("fused_sdpa", PatchedModuleFusedSDPA),
-    "MoeMatmul": ModuleInfo("linear", PatchedMoeMatmul),
-    "MoeFP8Matmul": ModuleInfo("linear", PatchedMoeFP8Matmul),
-    "ReplicatedLinear": ModuleInfo("linear", PatchedReplicatedLinear),
+    "MoeMatmul": ModuleInfo("linear", PatchedMoeMatmul, supports_dynamic_quantization=True),
+    "MoeFP8Matmul": ModuleInfo("linear", PatchedMoeFP8Matmul, supports_dynamic_quantization=True),
+    "ReplicatedLinear": ModuleInfo("linear", PatchedReplicatedLinear, supports_dynamic_quantization=True),
     # Note: `no_quantize_op` indicates that this module is patched but does not require measurement or quantization.
-    "FusedMoE": ModuleInfo("no_quantize_op", PatchedMixtralMoE, False),
-    "SharedFusedMoE": ModuleInfo("no_quantize_op", PatchedMixtralMoE, False),
+    "FusedMoE": ModuleInfo("no_quantize_op", PatchedMixtralMoE, False, supports_dynamic_quantization=True),
+    "SharedFusedMoE": ModuleInfo("no_quantize_op", PatchedMixtralMoE, False, supports_dynamic_quantization=True),
     "GaudiMixtralSparseMoeBlock": ModuleInfo("dynamic_moe", PatchedGaudiMixtralSparseMoeBlock),
     "GaudiDeepseekV3MoE": ModuleInfo("dynamic_moe", PatchedGaudiDeepseekV3MoE),
     "GaudiFP8Linear": ModuleInfo("linear", PatchedMoeFP8Matmul),
-    "VllmMixtureOfExpertsOp": ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOp),
-    "VllmMixtureOfExpertsOpFP8": ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpFP8),
-    "VllmMixtureOfExpertsOpFP8PerChannel": ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpFP8),
+    "VllmMixtureOfExpertsOp": ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOp, supports_dynamic_quantization=True),
+    "VllmMixtureOfExpertsOpFP8": ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpFP8, supports_dynamic_quantization=True),
+    "VllmMixtureOfExpertsOpFP8PerChannel": ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpFP8, supports_dynamic_quantization=True),
 }
 
 
 if deepspeed_exists:
     _mod_default_dict.update(
         {
-            "LinearLayer": ModuleInfo("linear", PatchedLinear),
-            "LinearAllreduce": ModuleInfo("linear", PatchedLinearAllReduce),
-            "ScopedLinearAllReduce": ModuleInfo("linear", PatchedLinearAllReduce),
-            "LmHeadLinearAllreduce": ModuleInfo("linear", PatchedLmHeadLinearAllreduce),
+            "LinearLayer": ModuleInfo("linear", PatchedLinear, supports_dynamic_quantization=True),
+            "LinearAllreduce": ModuleInfo("linear", PatchedLinearAllReduce, supports_dynamic_quantization=True),
+            "ScopedLinearAllReduce": ModuleInfo("linear", PatchedLinearAllReduce, supports_dynamic_quantization=True),
+            "LmHeadLinearAllreduce": ModuleInfo("linear", PatchedLmHeadLinearAllreduce, supports_dynamic_quantization=True),
         }
     )
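For illustration, the flags set above can be read back through the same table that quant_config.py now consults (next diff). A hedged sketch, assuming get_patched_module_table() returns a plain dict keyed by these op names, as the .get() lookup in the next file suggests:

# Hedged sketch: list which registered ops advertise dynamic-quantization
# support by reading the new flag straight off their ModuleInfo entries.
from neural_compressor.torch.algorithms.fp8_quant.model_configs import get_patched_module_table

dynamic_ops = sorted(
    name
    for name, info in get_patched_module_table().items()
    if getattr(info, "supports_dynamic_quantization", False)
)
print(dynamic_ops)  # expected to include "Linear", "FusedMoE", "VllmMixtureOfExpertsOp", ...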

neural_compressor/torch/algorithms/fp8_quant/_quant_common/quant_config.py

Lines changed: 9 additions & 3 deletions
@@ -98,10 +98,16 @@ class DeviceForScalesType(Enum):
 ]
 
 # TODO [SW-217813]: support dynamic quantization in all ops and remove
-# TODO [SW-228723]: get a better way to list all linear ops, like set in ModuleInfo if supports dynamic
-supported_dynamic_ops = ["Linear", "RowParallelLinear", "ColumnParallelLinear", "MergedColumnParallelLinear", "QKVParallelLinear", "FalconLinear", "LoRACompatibleLinear", "ReplicatedLinear", "LinearLayer", "LinearAllreduce", "ScopedLinearAllReduce", "LmHeadLinearAllreduce", "FusedMoE", "GaudiMixtralSparseMoeBlock", "VllmMixtureOfExpertsOp", "VllmMixtureOfExpertsOpFP8", "GaudiDeepseekV3MoE", "GaudiFP8Linear"]
+from neural_compressor.torch.algorithms.fp8_quant.model_configs import get_patched_module_table, ModuleInfo
+
 def is_supported_dynamic_op(op_str):
-    ret = op_str in supported_dynamic_ops
+    """
+    Dynamically checks if the given op supports dynamic quantization
+    by looking up its ModuleInfo and checking for a 'supports_dynamic_quantization' attribute.
+    """
+    patched_table = get_patched_module_table()
+    info = patched_table.get(op_str)
+    ret = getattr(info, "supports_dynamic_quantization", False) if info is not None else False
     logger.trace("Checking if %s is supported for dynamic quantization: %s", op_str, ret)
     return ret
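A hedged usage sketch of the reworked check; the import path follows the file location above, and the expected results follow from the flags set in patching_common.py:

# With the registry-driven lookup, membership in the old hardcoded
# supported_dynamic_ops list is no longer what decides the answer.
from neural_compressor.torch.algorithms.fp8_quant._quant_common.quant_config import is_supported_dynamic_op

print(is_supported_dynamic_op("Linear"))      # True  -- flagged in _mod_default_dict
print(is_supported_dynamic_op("Conv2d"))      # False -- patched, but not flagged
print(is_supported_dynamic_op("NotARealOp"))  # False -- no ModuleInfo entry; the getattr guard returns False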

neural_compressor/torch/algorithms/fp8_quant/model_configs.py

Lines changed: 4 additions & 2 deletions
@@ -40,16 +40,18 @@ class ModuleInfo:
     Configures a relation between a ModuleType key (from `_mod_types` dict in `common.py`)
     to a PatchedModule class.
     """
-    def __init__(self, type, patched_module, should_measure_and_quant=True):
+    def __init__(self, type, patched_module, should_measure_and_quant=True, *, supports_dynamic_quantization=False):
         self.type = type
         self.patched_module = patched_module
         self.should_measure_and_quant = should_measure_and_quant
+        self.supports_dynamic_quantization = supports_dynamic_quantization
 
     def __repr__(self):
         return (
             f"ModuleInfo(type={self.type}, "
             f"patched_module={self.patched_module.__name__}), "
-            f"should_measure_and_quant={self.should_measure_and_quant}"
+            f"should_measure_and_quant={self.should_measure_and_quant}, "
+            f"supports_dynamic_quantization={self.supports_dynamic_quantization}"
         )
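A short hedged example of the widened constructor; because the new argument is keyword-only and defaults to False, existing ModuleInfo(...) call sites keep their previous behavior. DummyPatched is a stand-in class for illustration, not part of the library:

from neural_compressor.torch.algorithms.fp8_quant.model_configs import ModuleInfo

class DummyPatched:  # stand-in; __repr__ only needs a class with a __name__
    pass

info = ModuleInfo("linear", DummyPatched, supports_dynamic_quantization=True)
legacy = ModuleInfo("linear", DummyPatched)  # flag defaults to False

print(info)                                  # repr now reports supports_dynamic_quantization=True
print(legacy.supports_dynamic_quantization)  # False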
