
Commit e9f568c

yiliu30 authored and czhu15 committed
Porting online convert for llm-compressor (vllm-project#1763)
## Usage

```
export VLLM_HPU_CONVERT_TO_FP8UZ=1
export VLLM_HPU_FORCE_CHANNEL_FP8=1
```

Original PR: HabanaAI#1505

@czhu15 @Wei-Lin-Intel @yangulei Please help review, thx!

---------

Signed-off-by: yiliu30 <[email protected]>
1 parent 2a3111b commit e9f568c
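
For reference, a minimal sketch of enabling the conversion from Python rather than the shell. This is not part of the commit: `LLM` is the standard vLLM entrypoint, and the checkpoint path is a placeholder for an llm-compressor FP8 model.

```python
import os

# Set before engine construction so vllm.envs picks the flags up
# (variable names come from this commit; the model path is a placeholder).
os.environ["VLLM_HPU_CONVERT_TO_FP8UZ"] = "1"
os.environ["VLLM_HPU_FORCE_CHANNEL_FP8"] = "1"

from vllm import LLM

llm = LLM(model="/path/to/llm-compressor-fp8-checkpoint")
print(llm.generate("Hello, my name is")[0].outputs[0].text)
```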

File tree

4 files changed: +35 -9 lines changed

- vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
- vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
- vllm/model_executor/layers/quantization/compressed_tensors/utils.py
- vllm/model_executor/models/glm4_moe.py

vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py

Lines changed: 9 additions & 3 deletions
```diff
@@ -5,14 +5,15 @@
 from typing import Any, Literal, Optional, cast
 
 import torch
+import vllm.envs as envs
 from compressed_tensors.config import (CompressionFormat,
                                        SparsityCompressionConfig,
                                        SparsityStructure)
 from compressed_tensors.quantization import (QuantizationArgs,
                                              QuantizationStrategy,
                                              QuantizationType)
-from pydantic import BaseModel
 
+from pydantic import BaseModel
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
@@ -29,8 +30,11 @@
     CompressedTensorsW8A8Int8, CompressedTensorsW8A16Fp8,
     CompressedTensorsWNA16)
 from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
-    find_matched_target, is_activation_quantization_format,
-    should_ignore_layer)
+    find_matched_target,
+    is_activation_quantization_format,
+    should_ignore_layer,
+    gaudi_weight_wrapper,
+)
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.platforms import current_platform
 
@@ -581,6 +585,8 @@ def create_weights(self, layer: torch.nn.Module,
         details
         """
         weight_loader = extra_weight_attrs.get("weight_loader")
+        if current_platform.is_hpu() and envs.VLLM_HPU_CONVERT_TO_FP8UZ:
+            weight_loader = gaudi_weight_wrapper(weight_loader)
         layer.scheme.create_weights(
             layer=layer,
             input_size=input_size,
```
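
To illustrate what the wrapping above does on the dense path, here is a toy sketch; it assumes the patched vllm from this commit is importable, and `fake_loader` is a stand-in for the real weight_loader taken from `extra_weight_attrs`, not vLLM's actual loader.

```python
import torch
from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
    gaudi_weight_wrapper)

def fake_loader(param: torch.nn.Parameter, loaded_weight: torch.Tensor):
    # Stand-in for the loader normally found in extra_weight_attrs.
    param.data.copy_(loaded_weight)

param = torch.nn.Parameter(torch.empty(4, dtype=torch.float8_e4m3fn),
                           requires_grad=False)
checkpoint_weight = torch.tensor([448.0, 2.0, -240.0, 1.0]).to(
    torch.float8_e4m3fn)

# With VLLM_HPU_CONVERT_TO_FP8UZ=1 on HPU, create_weights() swaps in this
# wrapped loader, so fp8 weights arrive halved: [224., 1., -120., 0.5].
loader = gaudi_weight_wrapper(fake_loader)
loader(param, checkpoint_weight)
print(param.float())
```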

vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

Lines changed: 5 additions & 1 deletion
```diff
@@ -29,7 +29,7 @@
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
 from vllm.scalar_type import scalar_types
-
+from vllm.model_executor.layers.quantization.compressed_tensors.utils import gaudi_weight_wrapper
 logger = init_logger(__name__)
 
 
@@ -144,6 +144,10 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int,
             params_dtype = torch.float8_e4m3fn
 
         # WEIGHTS
+        if current_platform.is_hpu() and envs.VLLM_HPU_CONVERT_TO_FP8UZ:
+            extra_weight_attrs["weight_loader"] = gaudi_weight_wrapper(
+                extra_weight_attrs.get("weight_loader")
+            )
         w13_weight = torch.nn.Parameter(torch.empty(
             num_experts,
             2 * intermediate_size_per_partition,
```
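
The same wrapper can be reused here because it is signature-agnostic: it only rescales `args[1]` and forwards everything else, so the extra positional arguments of a fused-MoE-style loader pass through untouched. A toy sketch (the loader below is a stand-in with a FusedMoE-like signature, not the real FusedMoE weight loader):

```python
import torch
from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
    gaudi_weight_wrapper)

def moe_loader(param, loaded_weight, weight_name, shard_id, expert_id):
    # Stand-in loader; only prints what it received.
    print(weight_name, shard_id, expert_id, loaded_weight.float().tolist())

loaded = torch.tensor([448.0, 2.0]).to(torch.float8_e4m3fn)
wrapped = gaudi_weight_wrapper(moe_loader)

# param is unused by the stand-in; the fp8 weight arrives halved and the
# trailing MoE arguments are forwarded unchanged.
wrapped(None, loaded, "w13_weight", "w1", 0)  # w13_weight w1 0 [224.0, 1.0]
```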

vllm/model_executor/layers/quantization/compressed_tensors/utils.py

Lines changed: 21 additions & 1 deletion
```diff
@@ -4,7 +4,7 @@
 from collections.abc import Iterable, Mapping
 from types import MappingProxyType
 from typing import Optional
-
+import torch
 import regex as re
 from compressed_tensors import CompressionFormat
 from torch.nn import Module
@@ -213,3 +213,23 @@ def _match_fused_layer(
         unfused_matches.append(None)
 
     return unfused_matches[0] if all(unfused_matches) else None
+
+def gaudi_weight_wrapper(weight_loader):
+    """Wrapper for Gaudi weight conversion."""
+
+    FP8_SCALE_FACTOR = 2.0
+    def wrapper(*args, **kwargs):
+        # args[0] is parameter, args[1] is loaded_weight
+        # weights will be always in fp8, but scales will be in fp32,
+        # so we can detect it by dtype
+        loaded_weight = args[1]
+        if loaded_weight.dtype == torch.float8_e4m3fn:
+            loaded_weight.data = (
+                loaded_weight.data.float() / FP8_SCALE_FACTOR
+            ).to(torch.float8_e4m3fn)
+        else:
+            loaded_weight.data = (loaded_weight.data * FP8_SCALE_FACTOR)
+        args = (args[0], loaded_weight) + args[2:]
+        weight_loader(*args, **kwargs)
+
+    return wrapper
```
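
Why halve the fp8 weights and double the fp32 scales: the dequantized product `weight * scale` is preserved while the stored fp8 values move into a narrower range, which is presumably what makes an OCP E4M3 checkpoint usable with Gaudi's FP8-UZ handling, as the env var name suggests. A small numeric sketch of that invariant (not part of the commit):

```python
import torch

FP8_SCALE_FACTOR = 2.0  # same constant as gaudi_weight_wrapper

weight = torch.tensor([448.0, -240.0, 1.5]).to(torch.float8_e4m3fn)
scale = torch.tensor(0.01, dtype=torch.float32)

# What the wrapper does: halve fp8 weights, double fp32 scales.
converted_weight = (weight.float() / FP8_SCALE_FACTOR).to(torch.float8_e4m3fn)
converted_scale = scale * FP8_SCALE_FACTOR

# Dequantized values are unchanged (up to fp8 rounding of the halved weights).
print(torch.allclose(weight.float() * scale,
                     converted_weight.float() * converted_scale))  # True
```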

vllm/model_executor/models/glm4_moe.py

Lines changed: 0 additions & 4 deletions
```diff
@@ -390,7 +390,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         # enable_eplb = vllm_config.parallel_config.enable_eplb
         enable_eplb = False
         self.config = config
-
         self.vocab_size = config.vocab_size
 
         if get_pp_group().is_first_rank:
@@ -511,7 +510,6 @@ def load_weights(self, weights: Iterable[tuple[str,
                     continue
                 if is_pp_missing_parameter(name, self):
                     continue
-
                 param = params_dict[name]
                 weight_loader = param.weight_loader
                 weight_loader(param, loaded_weight, shard_id)
@@ -533,7 +531,6 @@ def load_weights(self, weights: Iterable[tuple[str,
 
                     if is_pp_missing_parameter(name_mapped, self):
                         continue
-
                     param = params_dict[name_mapped]
                     # We should ask the weight loader to return success or not
                     # here since otherwise we may skip experts with other
@@ -565,7 +562,6 @@ def load_weights(self, weights: Iterable[tuple[str,
 
                 if is_pp_missing_parameter(name, self):
                     continue
-
                 param = params_dict[name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
```
