1 file changed: +3 −3 lines changed

vllm/model_executor/layers/quantization

@@ -450,10 +450,10 @@ def process_weights_after_loading(self, layer: Module) -> None:
         # Activations not quantized for marlin.
         del layer.input_scale

-        # On B200, if E8M0 for DeepGemm is used, we need to
+        # On Blackwell or Hopper, if E8M0 for DeepGemm is used, we need to
         # requantize the weight and input to the specific scale
         # at the same time.
-        if is_deep_gemm_e8m0_used():
+        if is_deep_gemm_e8m0_used() and self.block_quant:
             assert layer.weight_block_size is not None
             block_sz = tuple(layer.weight_block_size)
             requant_weight_ue8m0_inplace(
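The added `and self.block_quant` guard matters because `weight_block_size` is only populated for block-quantized checkpoints; a per-tensor FP8 model would otherwise reach the `assert` with `None` and crash. For context, UE8M0 scales are pure powers of two (8 exponent bits, no mantissa bits), so the re-quantisation conceptually snaps each block scale up to the next power of two and rescales the weight block to compensate. Below is a minimal sketch of that idea, not vLLM's actual `requant_weight_ue8m0_inplace` implementation; the function name, tensor layout, and rounding rule here are assumptions:

```python
import torch

def requant_to_ue8m0_sketch(weight: torch.Tensor,
                            scale: torch.Tensor,
                            block_sz: tuple[int, int]) -> None:
    # Snap each per-block scale up to the nearest power of two, since
    # E8M0 has no mantissa bits and can only encode exact powers of two.
    new_scale = torch.exp2(torch.ceil(torch.log2(scale)))
    # Expand the per-block ratio to element granularity and rescale the
    # weight so that (weight * scale) is preserved after the snap.
    bm, bn = block_sz
    m, n = weight.shape
    ratio = (scale / new_scale).repeat_interleave(bm, 0).repeat_interleave(bn, 1)
    weight.copy_((weight.float() * ratio[:m, :n]).to(weight.dtype))
    scale.copy_(new_scale)
```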
@@ -905,7 +905,7 @@ def process_weights_after_loading(self, layer: Module) -> None:
         del layer.w13_input_scale
         del layer.w2_input_scale

-        if is_deep_gemm_e8m0_used():
+        if is_deep_gemm_e8m0_used() and self.block_quant:
             assert layer.weight_block_size is not None
             # Re-quantise the expert weights so their scales are UE8M0.
             block_sz = tuple(layer.weight_block_size)
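In the MoE path the same guarded re-quantisation is applied to each expert's fused weights. A hedged sketch of how the helper would be invoked there, assuming `w13_weight`/`w2_weight` and matching `*_weight_scale_inv` attribute names (the diff only confirms the input-scale attributes it deletes):

```python
# Sketch only: apply the UE8M0 snap to both fused expert weight tensors.
if is_deep_gemm_e8m0_used() and self.block_quant:
    assert layer.weight_block_size is not None
    block_sz = tuple(layer.weight_block_size)
    for w, s in ((layer.w13_weight, layer.w13_weight_scale_inv),
                 (layer.w2_weight, layer.w2_weight_scale_inv)):
        requant_weight_ue8m0_inplace(w, s, block_sz)
```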