@@ -67,43 +67,43 @@ def create_mod_info_recursion(parent):
 
 _mod_default_dict = {
     "Matmul": ModuleInfo("matmul", PatchedMatmul),
-    "Linear": ModuleInfo("linear", PatchedLinear),
-    "ParallelLMHead": ModuleInfo("linear", PatchedParallelLMHead),
-    "RowParallelLinear": ModuleInfo("row_parallel_linear", PatchedRowParallelLinear),
-    "ColumnParallelLinear": ModuleInfo("linear", PatchedColumnParallelLinear),
-    "MergedColumnParallelLinear": ModuleInfo("linear", PatchedColumnParallelLinear),
-    "QKVParallelLinear": ModuleInfo("linear", PatchedColumnParallelLinear),
-    "FalconLinear": ModuleInfo("linear", PatchedLinear),
+    "Linear": ModuleInfo("linear", PatchedLinear, supports_dynamic_quantization=True),
+    "ParallelLMHead": ModuleInfo("linear", PatchedParallelLMHead, supports_dynamic_quantization=True),
+    "RowParallelLinear": ModuleInfo("row_parallel_linear", PatchedRowParallelLinear, supports_dynamic_quantization=True),
+    "ColumnParallelLinear": ModuleInfo("linear", PatchedColumnParallelLinear, supports_dynamic_quantization=True),
+    "MergedColumnParallelLinear": ModuleInfo("linear", PatchedColumnParallelLinear, supports_dynamic_quantization=True),
+    "QKVParallelLinear": ModuleInfo("linear", PatchedColumnParallelLinear, supports_dynamic_quantization=True),
+    "FalconLinear": ModuleInfo("linear", PatchedLinear, supports_dynamic_quantization=True),
     "KVCache": ModuleInfo("kv_cache", PatchedKVCache),
     "VLLMKVCache": ModuleInfo("kv_cache", PatchedVLLMKVCache),
     "Conv2d": ModuleInfo("linear", PatchedConv2d),
-    "LoRACompatibleLinear": ModuleInfo("linear", PatchedLoRACompatibleLinear),
+    "LoRACompatibleLinear": ModuleInfo("linear", PatchedLoRACompatibleLinear, supports_dynamic_quantization=True),
     "LoRACompatibleConv": ModuleInfo("linear", PatchedLoRACompatibleConv),
     "Softmax": ModuleInfo("softmax", PatchedSoftmax),
     "BlockSoftmaxConstMax": ModuleInfo("softmax", PatchedBlockSoftmaxConstMax),
     "ModuleFusedSDPA": ModuleInfo("fused_sdpa", PatchedModuleFusedSDPA),
-    "MoeMatmul": ModuleInfo("linear", PatchedMoeMatmul),
-    "MoeFP8Matmul": ModuleInfo("linear", PatchedMoeFP8Matmul),
-    "ReplicatedLinear": ModuleInfo("linear", PatchedReplicatedLinear),
+    "MoeMatmul": ModuleInfo("linear", PatchedMoeMatmul, supports_dynamic_quantization=True),
+    "MoeFP8Matmul": ModuleInfo("linear", PatchedMoeFP8Matmul, supports_dynamic_quantization=True),
+    "ReplicatedLinear": ModuleInfo("linear", PatchedReplicatedLinear, supports_dynamic_quantization=True),
     # Note: `no_quantize_op` indicates that this module is patched but does not require measurement or quantization.
-    "FusedMoE": ModuleInfo("no_quantize_op", PatchedMixtralMoE, False),
-    "SharedFusedMoE": ModuleInfo("no_quantize_op", PatchedMixtralMoE, False),
+    "FusedMoE": ModuleInfo("no_quantize_op", PatchedMixtralMoE, False, supports_dynamic_quantization=True),
+    "SharedFusedMoE": ModuleInfo("no_quantize_op", PatchedMixtralMoE, False, supports_dynamic_quantization=True),
     "GaudiMixtralSparseMoeBlock": ModuleInfo("dynamic_moe", PatchedGaudiMixtralSparseMoeBlock),
     "GaudiDeepseekV3MoE": ModuleInfo("dynamic_moe", PatchedGaudiDeepseekV3MoE),
     "GaudiFP8Linear": ModuleInfo("linear", PatchedMoeFP8Matmul),
-    "VllmMixtureOfExpertsOp": ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOp),
-    "VllmMixtureOfExpertsOpFP8": ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpFP8),
-    "VllmMixtureOfExpertsOpFP8PerChannel": ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpFP8),
+    "VllmMixtureOfExpertsOp": ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOp, supports_dynamic_quantization=True),
+    "VllmMixtureOfExpertsOpFP8": ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpFP8, supports_dynamic_quantization=True),
+    "VllmMixtureOfExpertsOpFP8PerChannel": ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpFP8, supports_dynamic_quantization=True),
 }
 
 
 if deepspeed_exists:
     _mod_default_dict.update(
         {
-            "LinearLayer": ModuleInfo("linear", PatchedLinear),
-            "LinearAllreduce": ModuleInfo("linear", PatchedLinearAllReduce),
-            "ScopedLinearAllReduce": ModuleInfo("linear", PatchedLinearAllReduce),
-            "LmHeadLinearAllreduce": ModuleInfo("linear", PatchedLmHeadLinearAllreduce),
+            "LinearLayer": ModuleInfo("linear", PatchedLinear, supports_dynamic_quantization=True),
+            "LinearAllreduce": ModuleInfo("linear", PatchedLinearAllReduce, supports_dynamic_quantization=True),
+            "ScopedLinearAllReduce": ModuleInfo("linear", PatchedLinearAllReduce, supports_dynamic_quantization=True),
+            "LmHeadLinearAllreduce": ModuleInfo("linear", PatchedLmHeadLinearAllreduce, supports_dynamic_quantization=True),
         }
     )
 
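The effect of this first hunk is that dynamic-quantization support is now declared at registration time, next to each patched module. Below is a minimal, self-contained sketch of the pattern; only `ModuleInfo` and `_mod_default_dict` are names taken from the diff, while `PatchedLinear` and `PatchedKVCache` are reduced to stand-in classes and the filtering loop is illustrative:

# Minimal sketch of the registration pattern above (stand-ins, not the real modules).
class ModuleInfo:
    def __init__(self, type, patched_module, should_measure_and_quant=True, *,
                 supports_dynamic_quantization=False):
        self.type = type
        self.patched_module = patched_module
        self.should_measure_and_quant = should_measure_and_quant
        self.supports_dynamic_quantization = supports_dynamic_quantization

class PatchedLinear: pass    # stand-in for the real patched module
class PatchedKVCache: pass   # stand-in for the real patched module

_mod_default_dict = {
    "Linear": ModuleInfo("linear", PatchedLinear, supports_dynamic_quantization=True),
    "KVCache": ModuleInfo("kv_cache", PatchedKVCache),  # flag defaults to False
}

# The flag turns "which ops support dynamic quantization" into a simple query:
dynamic_ops = [name for name, info in _mod_default_dict.items()
               if info.supports_dynamic_quantization]
print(dynamic_ops)  # ['Linear']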
@@ -98,10 +98,16 @@ class DeviceForScalesType(Enum):
 ]
 
 # TODO [SW-217813]: support dynamic quantization in all ops and remove
-# TODO [SW-228723]: get a better way to list all linear ops, like set in ModuleInfo if supports dynamic
-supported_dynamic_ops = ["Linear", "RowParallelLinear", "ColumnParallelLinear", "MergedColumnParallelLinear", "QKVParallelLinear", "FalconLinear", "LoRACompatibleLinear", "ReplicatedLinear", "LinearLayer", "LinearAllreduce", "ScopedLinearAllReduce", "LmHeadLinearAllreduce", "FusedMoE", "GaudiMixtralSparseMoeBlock", "VllmMixtureOfExpertsOp", "VllmMixtureOfExpertsOpFP8", "GaudiDeepseekV3MoE", "GaudiFP8Linear"]
+from neural_compressor.torch.algorithms.fp8_quant.model_configs import get_patched_module_table, ModuleInfo
 
 def is_supported_dynamic_op(op_str):
-    ret = op_str in supported_dynamic_ops
+    """
+    Dynamically checks if the given op supports dynamic quantization
+    by looking up its ModuleInfo and checking for a 'supports_dynamic_quantization' attribute.
+    """
+    patched_table = get_patched_module_table()
+    info = patched_table.get(op_str)
+    ret = getattr(info, "supports_dynamic_quantization", False) if info is not None else False
     logger.trace("Checking if %s is supported for dynamic quantization: %s", op_str, ret)
     return ret

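With the hard-coded `supported_dynamic_ops` list gone, support is derived from the same table the patching code uses, so a module registered with the flag is picked up automatically and unknown ops fall back to `False`. A small usage sketch, assuming the registry entries shown in the first hunk (the op names come from the diff; the assertions are illustrative):

# Hypothetical call sites for the reworked helper.
assert is_supported_dynamic_op("Linear")            # registered with the flag set
assert not is_supported_dynamic_op("KVCache")       # registered, but flag left at False
assert not is_supported_dynamic_op("NoSuchModule")  # unknown op: lookup returns None -> False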
neural_compressor/torch/algorithms/fp8_quant/model_configs.py: 4 additions & 2 deletions
@@ -40,16 +40,18 @@ class ModuleInfo:
     Configures a relation between a ModuleType key (from `_mod_types` dict in `common.py`)
     to a PatchedModule class.
     """
-    def __init__(self, type, patched_module, should_measure_and_quant=True):
+    def __init__(self, type, patched_module, should_measure_and_quant=True, *, supports_dynamic_quantization=False):
         self.type = type
         self.patched_module = patched_module
         self.should_measure_and_quant = should_measure_and_quant
+        self.supports_dynamic_quantization = supports_dynamic_quantization
 
     def __repr__(self):
         return (
             f"ModuleInfo(type={self.type}, "
             f"patched_module={self.patched_module.__name__}), "
-            f"should_measure_and_quant={self.should_measure_and_quant}"
+            f"should_measure_and_quant={self.should_measure_and_quant}, "
+            f"supports_dynamic_quantization={self.supports_dynamic_quantization}"
         )
 
 
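Making `supports_dynamic_quantization` keyword-only keeps every existing positional call site valid, including the ones that pass `False` positionally for `should_measure_and_quant`. A quick illustration under the same stand-in definitions as the sketch above:

info = ModuleInfo("no_quantize_op", PatchedLinear, False, supports_dynamic_quantization=True)
print(info.should_measure_and_quant)       # False
print(info.supports_dynamic_quantization)  # True
print(info)  # repr now reports the new flag as well

# Passing the flag positionally is rejected, which guards against silently
# binding it to the wrong parameter:
# ModuleInfo("linear", PatchedLinear, True, True)  -> TypeError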