@@ -67,43 +67,43 @@ def create_mod_info_recursion(parent):
 
 _mod_default_dict = {
     "Matmul": ModuleInfo("matmul", PatchedMatmul),
-    "Linear": ModuleInfo("linear", PatchedLinear),
-    "ParallelLMHead": ModuleInfo("linear", PatchedParallelLMHead),
-    "RowParallelLinear": ModuleInfo("row_parallel_linear", PatchedRowParallelLinear),
-    "ColumnParallelLinear": ModuleInfo("linear", PatchedColumnParallelLinear),
-    "MergedColumnParallelLinear": ModuleInfo("linear", PatchedColumnParallelLinear),
-    "QKVParallelLinear": ModuleInfo("linear", PatchedColumnParallelLinear),
-    "FalconLinear": ModuleInfo("linear", PatchedLinear),
+    "Linear": ModuleInfo("linear", PatchedLinear, supports_dynamic_quantization=True),
+    "ParallelLMHead": ModuleInfo("linear", PatchedParallelLMHead, supports_dynamic_quantization=True),
+    "RowParallelLinear": ModuleInfo("row_parallel_linear", PatchedRowParallelLinear, supports_dynamic_quantization=True),
+    "ColumnParallelLinear": ModuleInfo("linear", PatchedColumnParallelLinear, supports_dynamic_quantization=True),
+    "MergedColumnParallelLinear": ModuleInfo("linear", PatchedColumnParallelLinear, supports_dynamic_quantization=True),
+    "QKVParallelLinear": ModuleInfo("linear", PatchedColumnParallelLinear, supports_dynamic_quantization=True),
+    "FalconLinear": ModuleInfo("linear", PatchedLinear, supports_dynamic_quantization=True),
     "KVCache": ModuleInfo("kv_cache", PatchedKVCache),
     "VLLMKVCache": ModuleInfo("kv_cache", PatchedVLLMKVCache),
     "Conv2d": ModuleInfo("linear", PatchedConv2d),
-    "LoRACompatibleLinear": ModuleInfo("linear", PatchedLoRACompatibleLinear),
+    "LoRACompatibleLinear": ModuleInfo("linear", PatchedLoRACompatibleLinear, supports_dynamic_quantization=True),
     "LoRACompatibleConv": ModuleInfo("linear", PatchedLoRACompatibleConv),
     "Softmax": ModuleInfo("softmax", PatchedSoftmax),
     "BlockSoftmaxConstMax": ModuleInfo("softmax", PatchedBlockSoftmaxConstMax),
     "ModuleFusedSDPA": ModuleInfo("fused_sdpa", PatchedModuleFusedSDPA),
-    "MoeMatmul": ModuleInfo("linear", PatchedMoeMatmul),
-    "MoeFP8Matmul": ModuleInfo("linear", PatchedMoeFP8Matmul),
-    "ReplicatedLinear": ModuleInfo("linear", PatchedReplicatedLinear),
+    "MoeMatmul": ModuleInfo("linear", PatchedMoeMatmul, supports_dynamic_quantization=True),
+    "MoeFP8Matmul": ModuleInfo("linear", PatchedMoeFP8Matmul, supports_dynamic_quantization=True),
+    "ReplicatedLinear": ModuleInfo("linear", PatchedReplicatedLinear, supports_dynamic_quantization=True),
     # Note: `no_quantize_op` indicates that this module is patched but does not require measurement or quantization.
-    "FusedMoE": ModuleInfo("no_quantize_op", PatchedMixtralMoE, False),
-    "SharedFusedMoE": ModuleInfo("no_quantize_op", PatchedMixtralMoE, False),
+    "FusedMoE": ModuleInfo("no_quantize_op", PatchedMixtralMoE, False, supports_dynamic_quantization=True),
+    "SharedFusedMoE": ModuleInfo("no_quantize_op", PatchedMixtralMoE, False, supports_dynamic_quantization=True),
     "GaudiMixtralSparseMoeBlock": ModuleInfo("dynamic_moe", PatchedGaudiMixtralSparseMoeBlock),
     "GaudiDeepseekV3MoE": ModuleInfo("dynamic_moe", PatchedGaudiDeepseekV3MoE),
     "GaudiFP8Linear": ModuleInfo("linear", PatchedMoeFP8Matmul),
-    "VllmMixtureOfExpertsOp": ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOp),
-    "VllmMixtureOfExpertsOpFP8": ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpFP8),
-    "VllmMixtureOfExpertsOpFP8PerChannel": ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpFP8),
+    "VllmMixtureOfExpertsOp": ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOp, supports_dynamic_quantization=True),
+    "VllmMixtureOfExpertsOpFP8": ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpFP8, supports_dynamic_quantization=True),
+    "VllmMixtureOfExpertsOpFP8PerChannel": ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpFP8, supports_dynamic_quantization=True),
 }
 
 
 if deepspeed_exists:
     _mod_default_dict.update(
         {
-            "LinearLayer": ModuleInfo("linear", PatchedLinear),
-            "LinearAllreduce": ModuleInfo("linear", PatchedLinearAllReduce),
-            "ScopedLinearAllReduce": ModuleInfo("linear", PatchedLinearAllReduce),
-            "LmHeadLinearAllreduce": ModuleInfo("linear", PatchedLmHeadLinearAllreduce),
+            "LinearLayer": ModuleInfo("linear", PatchedLinear, supports_dynamic_quantization=True),
+            "LinearAllreduce": ModuleInfo("linear", PatchedLinearAllReduce, supports_dynamic_quantization=True),
+            "ScopedLinearAllReduce": ModuleInfo("linear", PatchedLinearAllReduce, supports_dynamic_quantization=True),
+            "LmHeadLinearAllreduce": ModuleInfo("linear", PatchedLmHeadLinearAllreduce, supports_dynamic_quantization=True),
         }
     )
 
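The effect of this first hunk is that dynamic-quantization support is now declared at registration time, next to each patched module. Below is a minimal, self-contained sketch of the pattern; only `ModuleInfo` and `_mod_default_dict` are names taken from the diff, while `PatchedLinear` and `PatchedKVCache` are reduced to stand-in classes and the filtering loop is illustrative:

# Minimal sketch of the registration pattern above (stand-ins, not the real modules).
class ModuleInfo:
    def __init__(self, type, patched_module, should_measure_and_quant=True, *,
                 supports_dynamic_quantization=False):
        self.type = type
        self.patched_module = patched_module
        self.should_measure_and_quant = should_measure_and_quant
        self.supports_dynamic_quantization = supports_dynamic_quantization

class PatchedLinear: pass    # stand-in for the real patched module
class PatchedKVCache: pass   # stand-in for the real patched module

_mod_default_dict = {
    "Linear": ModuleInfo("linear", PatchedLinear, supports_dynamic_quantization=True),
    "KVCache": ModuleInfo("kv_cache", PatchedKVCache),  # flag defaults to False
}

# The flag turns "which ops support dynamic quantization" into a simple query:
dynamic_ops = [name for name, info in _mod_default_dict.items()
               if info.supports_dynamic_quantization]
print(dynamic_ops)  # ['Linear']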
@@ -98,10 +98,16 @@ class DeviceForScalesType(Enum):
 ]
 
 # TODO [SW-217813]: support dynamic quantization in all ops and remove
-# TODO [SW-228723]: get a better way to list all linear ops, like set in ModuleInfo if supports dynamic
-supported_dynamic_ops = ["Linear", "RowParallelLinear", "ColumnParallelLinear", "MergedColumnParallelLinear", "QKVParallelLinear", "FalconLinear", "LoRACompatibleLinear", "ReplicatedLinear", "LinearLayer", "LinearAllreduce", "ScopedLinearAllReduce", "LmHeadLinearAllreduce", "FusedMoE", "GaudiMixtralSparseMoeBlock", "VllmMixtureOfExpertsOp", "VllmMixtureOfExpertsOpFP8", "GaudiDeepseekV3MoE", "GaudiFP8Linear"]
+from neural_compressor.torch.algorithms.fp8_quant.model_configs import get_patched_module_table, ModuleInfo
 
 def is_supported_dynamic_op(op_str):
-    ret = op_str in supported_dynamic_ops
+    """
+    Dynamically checks if the given op supports dynamic quantization
+    by looking up its ModuleInfo and checking for a 'supports_dynamic_quantization' attribute.
+    """
+    patched_table = get_patched_module_table()
+    info = patched_table.get(op_str)
+    ret = getattr(info, "supports_dynamic_quantization", False) if info is not None else False
     logger.trace("Checking if %s is supported for dynamic quantization: %s", op_str, ret)
     return ret

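With the hard-coded `supported_dynamic_ops` list gone, support is derived from the same table the patching code uses, so a module registered with the flag is picked up automatically and unknown ops fall back to `False`. A small usage sketch, assuming the registry entries shown in the first hunk (the op names come from the diff; the assertions are illustrative):

# Hypothetical call sites for the reworked helper.
assert is_supported_dynamic_op("Linear")            # registered with the flag set
assert not is_supported_dynamic_op("KVCache")       # registered, but flag left at False
assert not is_supported_dynamic_op("NoSuchModule")  # unknown op: lookup returns None -> False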
neural_compressor/torch/algorithms/fp8_quant/model_configs.py: 4 additions & 2 deletions
@@ -40,16 +40,18 @@ class ModuleInfo:
     Configures a relation between a ModuleType key (from `_mod_types` dict in `common.py`)
     to a PatchedModule class.
     """
-    def __init__(self, type, patched_module, should_measure_and_quant=True):
+    def __init__(self, type, patched_module, should_measure_and_quant=True, *, supports_dynamic_quantization=False):
         self.type = type
         self.patched_module = patched_module
         self.should_measure_and_quant = should_measure_and_quant
+        self.supports_dynamic_quantization = supports_dynamic_quantization
 
     def __repr__(self):
         return (
             f"ModuleInfo(type={self.type}, "
             f"patched_module={self.patched_module.__name__}), "
-            f"should_measure_and_quant={self.should_measure_and_quant}"
+            f"should_measure_and_quant={self.should_measure_and_quant}, "
+            f"supports_dynamic_quantization={self.supports_dynamic_quantization}"
         )
 
 
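Making `supports_dynamic_quantization` keyword-only keeps every existing positional call site valid, including the ones that pass `False` positionally for `should_measure_and_quant`. A quick illustration under the same stand-in definitions as the sketch above:

info = ModuleInfo("no_quantize_op", PatchedLinear, False, supports_dynamic_quantization=True)
print(info.should_measure_and_quant)       # False
print(info.supports_dynamic_quantization)  # True
print(info)  # repr now reports the new flag as well

# Passing the flag positionally is rejected, which guards against silently
# binding it to the wrong parameter:
# ModuleInfo("linear", PatchedLinear, True, True)  -> TypeError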