modelopt/torch/export/quant_utils.py (8 changes: 5 additions & 3 deletions)
@@ -756,6 +756,11 @@ def to_quantized_weight(
     if isinstance(weight, QTensorWrapper):
         return weight.data

+    if weight.dim() == 3:
+        # for MOE stacked weights
+        # Clear GPU cache to avoid potential GPU OOM issues for large models.
+        clear_cuda_cache()
+
     if quantization == QUANTIZATION_FP8:
         # Fix RuntimeError: Promotion for Float8 Types is not supported, attempted to promote Float8_e4m3fn and Float
         # in speculative decoding fp8 model export
@@ -764,9 +769,6 @@ def to_quantized_weight(
             return weight

         if weight.dim() == 3:
-            # for MOE stacked weights
-            # Clear GPU cache to avoid potential GPU OOM issues for large models.
-            clear_cuda_cache()
             return (weight / weights_scaling_factor.unsqueeze(-1)).to(torch.float8_e4m3fn)
         return (weight / weights_scaling_factor).to(torch.float8_e4m3fn)
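
In effect, the change hoists the MoE cache clear above the FP8 early-return: 3-D stacked expert weights now trigger a cache flush at the top of to_quantized_weight for every quantization format, instead of only on the FP8 path right before the scaled division. For readers without the modelopt source handy, here is a minimal sketch of what the helper is assumed to do; the real clear_cuda_cache lives elsewhere in the package and may differ:

# Hedged sketch only: assumes clear_cuda_cache() is roughly a gc pass
# plus torch.cuda.empty_cache(); the actual modelopt helper may differ.
import gc

import torch


def clear_cuda_cache() -> None:
    """Release cached CUDA allocator blocks before large temporaries are built.

    The division ``weight / weights_scaling_factor`` materializes a
    full-size temporary tensor, so flushing the cache first lowers the
    peak-memory high-water mark for stacked MoE weights.
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

Calling this once per weight at function entry is cheap relative to the quantization math, which is presumably why the PR clears unconditionally for 3-D weights rather than per quantization branch.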
