@@ -67,43 +67,43 @@ def create_mod_info_recursion(parent):
6767
# Default registry of patchable modules.
#
# Each key is a name string and each value is a ModuleInfo built from:
#   1. a type tag string (e.g. "linear", "kv_cache", "dynamic_moe"),
#   2. the Patched* class registered for that name,
#   3. optionally a third positional argument (False for the two
#      "no_quantize_op" entries; its meaning is not visible here),
#   4. optionally the keyword `supports_dynamic_quantization=True`.
#
# Keys are presumably the class names of the modules being patched -- TODO confirm.
# Entries that do not pass `supports_dynamic_quantization` rely on whatever
# default ModuleInfo defines for it.
_mod_default_dict = {
    "Matmul": ModuleInfo("matmul", PatchedMatmul),
    "Linear": ModuleInfo("linear", PatchedLinear, supports_dynamic_quantization=True),
    "ParallelLMHead": ModuleInfo("linear", PatchedParallelLMHead, supports_dynamic_quantization=True),
    "RowParallelLinear": ModuleInfo("row_parallel_linear", PatchedRowParallelLinear, supports_dynamic_quantization=True),
    "ColumnParallelLinear": ModuleInfo("linear", PatchedColumnParallelLinear, supports_dynamic_quantization=True),
    "MergedColumnParallelLinear": ModuleInfo("linear", PatchedColumnParallelLinear, supports_dynamic_quantization=True),
    "QKVParallelLinear": ModuleInfo("linear", PatchedColumnParallelLinear, supports_dynamic_quantization=True),
    "FalconLinear": ModuleInfo("linear", PatchedLinear, supports_dynamic_quantization=True),
    "KVCache": ModuleInfo("kv_cache", PatchedKVCache),
    "VLLMKVCache": ModuleInfo("kv_cache", PatchedVLLMKVCache),
    "Conv2d": ModuleInfo("linear", PatchedConv2d),
    "LoRACompatibleLinear": ModuleInfo("linear", PatchedLoRACompatibleLinear, supports_dynamic_quantization=True),
    "LoRACompatibleConv": ModuleInfo("linear", PatchedLoRACompatibleConv),
    "Softmax": ModuleInfo("softmax", PatchedSoftmax),
    "BlockSoftmaxConstMax": ModuleInfo("softmax", PatchedBlockSoftmaxConstMax),
    "ModuleFusedSDPA": ModuleInfo("fused_sdpa", PatchedModuleFusedSDPA),
    "MoeMatmul": ModuleInfo("linear", PatchedMoeMatmul, supports_dynamic_quantization=True),
    "MoeFP8Matmul": ModuleInfo("linear", PatchedMoeFP8Matmul, supports_dynamic_quantization=True),
    "ReplicatedLinear": ModuleInfo("linear", PatchedReplicatedLinear, supports_dynamic_quantization=True),
    # Note: `no_quantize_op` indicates that this module is patched but does not require measurement or quantization.
    "FusedMoE": ModuleInfo("no_quantize_op", PatchedMixtralMoE, False, supports_dynamic_quantization=True),
    "SharedFusedMoE": ModuleInfo("no_quantize_op", PatchedMixtralMoE, False, supports_dynamic_quantization=True),
    "GaudiMixtralSparseMoeBlock": ModuleInfo("dynamic_moe", PatchedGaudiMixtralSparseMoeBlock),
    "GaudiDeepseekV3MoE": ModuleInfo("dynamic_moe", PatchedGaudiDeepseekV3MoE),
    # NOTE(review): reuses PatchedMoeFP8Matmul like "MoeFP8Matmul" above, but
    # without supports_dynamic_quantization -- confirm this is intentional.
    "GaudiFP8Linear": ModuleInfo("linear", PatchedMoeFP8Matmul),
    "VllmMixtureOfExpertsOp": ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOp, supports_dynamic_quantization=True),
    "VllmMixtureOfExpertsOpFP8": ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpFP8, supports_dynamic_quantization=True),
    "VllmMixtureOfExpertsOpFP8PerChannel": ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpFP8, supports_dynamic_quantization=True),
}
9898
9999
# When `deepspeed_exists` is truthy, extend the default registry with four
# additional linear-type entries. All four use the "linear" type tag and pass
# supports_dynamic_quantization=True.
# Presumably these names are DeepSpeed layer classes -- TODO confirm.
if deepspeed_exists:
    _mod_default_dict.update(
        {
            "LinearLayer": ModuleInfo("linear", PatchedLinear, supports_dynamic_quantization=True),
            "LinearAllreduce": ModuleInfo("linear", PatchedLinearAllReduce, supports_dynamic_quantization=True),
            "ScopedLinearAllReduce": ModuleInfo("linear", PatchedLinearAllReduce, supports_dynamic_quantization=True),
            "LmHeadLinearAllreduce": ModuleInfo("linear", PatchedLmHeadLinearAllreduce, supports_dynamic_quantization=True),
        }
    )
109109
0 commit comments