
Commit 8b03c85

Authored by yiliu30, ranzhejiang, and Yi
Speedup online convert (#1772)
Porting the remaining part of #1505:

- GLM-4.5-Air-FP8 (~100B): 600 s -> 60.50 s
- DS R1 (~600B): about 430 s with this fix

@czhu15 @yangulei Please help to review, thanks! cc @thuang6

Signed-off-by: yiliu30 <[email protected]>
Co-authored-by: ranzhejiang <[email protected]>
Co-authored-by: Yi <[email protected]>
1 parent 0cd2bc6 commit 8b03c85

7 files changed (+86, -85 lines)
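Taken together, the change has two parts: the Gaudi FP8 rescaling helper `gaudi_weight_wrapper` moves into `vllm/model_executor/model_loader/weight_utils.py` so the quantization backends share a single copy, and a new `with_thread_limits` decorator caps the host OMP/torch thread counts only for the duration of `load_weights`. A minimal sketch of how the two pieces compose, using a hypothetical model class (not how vLLM wires them up internally; the helper names come from the diffs below):

import torch

from vllm.model_executor.model_loader.weight_utils import (
    default_weight_loader, gaudi_weight_wrapper, with_thread_limits)


class DemoModel(torch.nn.Module):

    @with_thread_limits()  # caps apply only on HPU with VLLM_HPU_CONVERT_TO_FP8UZ
    def load_weights(self, weights):
        # Wrap the plain loader once; each fp8 tensor is rescaled in place
        # before the original loader copies it into the parameter.
        loader = gaudi_weight_wrapper(default_weight_loader)
        for name, loaded_weight in weights:
            loader(self.get_parameter(name), loaded_weight)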

vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py

Lines changed: 1 addition & 1 deletion
@@ -33,8 +33,8 @@
     find_matched_target,
     is_activation_quantization_format,
     should_ignore_layer,
-    gaudi_weight_wrapper,
 )
+from vllm.model_executor.model_loader.weight_utils import gaudi_weight_wrapper
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.platforms import current_platform

vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

Lines changed: 3 additions & 3 deletions
@@ -29,7 +29,8 @@
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
 from vllm.scalar_type import scalar_types
-from vllm.model_executor.layers.quantization.compressed_tensors.utils import gaudi_weight_wrapper
+from vllm.model_executor.model_loader.weight_utils import gaudi_weight_wrapper
+
 logger = init_logger(__name__)

@@ -146,8 +147,7 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int,
         # WEIGHTS
         if current_platform.is_hpu() and envs.VLLM_HPU_CONVERT_TO_FP8UZ:
             extra_weight_attrs["weight_loader"] = gaudi_weight_wrapper(
-                extra_weight_attrs.get("weight_loader")
-            )
+                extra_weight_attrs.get("weight_loader"))
         w13_weight = torch.nn.Parameter(torch.empty(
             num_experts,
             2 * intermediate_size_per_partition,
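Because `gaudi_weight_wrapper` only rewrites `args[1]` and forwards everything else through `*args, **kwargs`, it preserves the signature of whatever loader is stored in `extra_weight_attrs["weight_loader"]`, including MoE loaders that take extra keyword arguments. A small illustration with a made-up loader (not vLLM code; it assumes this commit is installed and a PyTorch build with `torch.float8_e4m3fn`):

import torch

from vllm.model_executor.model_loader.weight_utils import gaudi_weight_wrapper


def fake_moe_loader(param, loaded_weight, weight_name=None, shard_id=None,
                    expert_id=None):
    # Stand-in for a fused-MoE weight loader with extra keyword arguments.
    param.data.copy_(loaded_weight.to(param.dtype))


param = torch.nn.Parameter(torch.zeros(4, dtype=torch.bfloat16))
fp8_weight = torch.full((4,), 2.0).to(torch.float8_e4m3fn)

wrapped = gaudi_weight_wrapper(fake_moe_loader)
wrapped(param, fp8_weight, weight_name="w13_weight", shard_id="w1", expert_id=0)

# The fp8 tensor was divided by FP8_SCALE_FACTOR (2.0) in place before the
# wrapped loader copied it into the parameter.
assert torch.allclose(param.float(), torch.ones(4))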

vllm/model_executor/layers/quantization/compressed_tensors/utils.py

Lines changed: 0 additions & 21 deletions
@@ -4,7 +4,6 @@
 from collections.abc import Iterable, Mapping
 from types import MappingProxyType
 from typing import Optional
-import torch
 import regex as re
 from compressed_tensors import CompressionFormat
 from torch.nn import Module

@@ -213,23 +212,3 @@ def _match_fused_layer(
             unfused_matches.append(None)

     return unfused_matches[0] if all(unfused_matches) else None
-
-def gaudi_weight_wrapper(weight_loader):
-    """Wrapper for Gaudi weight conversion."""
-
-    FP8_SCALE_FACTOR = 2.0
-    def wrapper(*args, **kwargs):
-        # args[0] is parameter, args[1] is loaded_weight
-        # weights will be always in fp8, but scales will be in fp32,
-        # so we can detect it by dtype
-        loaded_weight = args[1]
-        if loaded_weight.dtype == torch.float8_e4m3fn:
-            loaded_weight.data = (
-                loaded_weight.data.float() / FP8_SCALE_FACTOR
-            ).to(torch.float8_e4m3fn)
-        else:
-            loaded_weight.data = (loaded_weight.data * FP8_SCALE_FACTOR)
-        args = (args[0], loaded_weight) + args[2:]
-        weight_loader(*args, **kwargs)
-
-    return wrapper
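The removed helper reappears, functionally unchanged, in weight_utils.py below. What it preserves is worth spelling out: fp8 weights are divided by `FP8_SCALE_FACTOR` while fp32 scales are multiplied by it, so the dequantized product `weight * scale` is unchanged; the factor of 2 lines up with the exponent-bias difference between E4M3FN and the FNUZ-style format that the `VLLM_HPU_CONVERT_TO_FP8UZ` name points at, though the commit does not state that explicitly. A quick numeric check of the invariant (an illustration, not code from this commit):

import torch

FP8_SCALE_FACTOR = 2.0

w = torch.tensor([1.5, -2.0]).to(torch.float8_e4m3fn)  # fp8 weight
s = torch.tensor(0.25)                                  # fp32 scale

w_conv = (w.float() / FP8_SCALE_FACTOR).to(torch.float8_e4m3fn)
s_conv = s * FP8_SCALE_FACTOR

# Dequantized values are identical before and after the conversion.
assert torch.allclose(w.float() * s, w_conv.float() * s_conv)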

vllm/model_executor/layers/quantization/fp8.py

Lines changed: 3 additions & 40 deletions
@@ -39,7 +39,7 @@
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
 from vllm.scalar_type import scalar_types
-
+from vllm.model_executor.model_loader.weight_utils import gaudi_weight_wrapper
 if current_platform.is_hpu():
     import vllm_hpu_extension.ops as hpu_ops
     from vllm_hpu_extension.ops import scaled_fp8_quant

@@ -228,7 +228,7 @@ def create_weights(
         layer.orig_dtype = params_dtype
         layer.weight_block_size = None
         if current_platform.is_hpu() and envs.VLLM_HPU_CONVERT_TO_FP8UZ:
-            weight_loader = self._gaudi_weight_wrapper(weight_loader)
+            weight_loader = gaudi_weight_wrapper(weight_loader)

         if self.block_quant:
             tp_size = get_tensor_model_parallel_world_size()

@@ -312,25 +312,6 @@ def create_weights(
         else:
             layer.register_parameter("input_scale", None)

-    def _gaudi_weight_wrapper(self, weight_loader):
-        """Wrapper for Gaudi weight conversion."""
-
-        def wrapper(*args, **kwargs):
-            # args[0] is parameter, args[1] is loaded_weight
-            # weights will be always in fp8, but scales will be in fp32,
-            # so we can detect it by dtype
-            loaded_weight = args[1]
-            if loaded_weight.dtype == torch.float8_e4m3fn:
-                loaded_weight = (loaded_weight.float() * 0.5).to(
-                    torch.float8_e4m3fn)
-            else:
-                loaded_weight = (loaded_weight.data * 2.0)
-            args = (args[0], loaded_weight) + args[2:]
-
-            weight_loader(*args, **kwargs)
-
-        return wrapper
-
     def _maybe_pad_weight(self, weight: torch.Tensor) -> torch.Tensor:
         # Pad the weight tensor. This is an optimization on ROCm platform, which
         # can benefit from tensors located far enough from one another in memory

@@ -541,7 +522,7 @@ def create_weights(self, layer: Module, num_experts: int, hidden_size: int,
         layer.weight_block_size = None
         layer.weight_block_size = None
         if current_platform.is_hpu() and envs.VLLM_HPU_CONVERT_TO_FP8UZ:
-            extra_weight_attrs["weight_loader"] = self._gaudi_weight_wrapper(
+            extra_weight_attrs["weight_loader"] = gaudi_weight_wrapper(
                 extra_weight_attrs.get("weight_loader"))
         layer.quant_config = self.quant_config
         if self.quant_config.is_checkpoint_fp8_serialized:

@@ -662,24 +643,6 @@ def create_weights(self, layer: Module, num_experts: int, hidden_size: int,
         layer.w13_input_scale = None
         layer.w2_input_scale = None

-    def _gaudi_weight_wrapper(self, weight_loader):
-        """Wrapper for Gaudi weight conversion."""
-
-        def wrapper(*args, **kwargs):
-            # args[0] is parameter, args[1] is loaded_weight
-            # weights will be always in fp8, but scales will be in fp32,
-            # so we can detect it by dtype
-            loaded_weight = args[1]
-            if loaded_weight.dtype == torch.float8_e4m3fn:
-                loaded_weight.data = (loaded_weight.data.float() * 0.5).to(
-                    torch.float8_e4m3fn)
-            else:
-                loaded_weight.data = (loaded_weight.data * 2.0)
-            args = (args[0], loaded_weight) + args[2:]
-            weight_loader(*args, **kwargs)
-
-        return wrapper
-
     def process_weights_after_loading(self, layer: Module) -> None:
         # Lazy import to avoid importing triton too early.
         from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (

vllm/model_executor/model_loader/weight_utils.py

Lines changed: 68 additions & 1 deletion
@@ -12,7 +12,7 @@
 from collections.abc import Generator
 from pathlib import Path
 from typing import Any, Callable, Optional, Union
-
+from functools import wraps
 import filelock
 import gguf
 import huggingface_hub.constants

@@ -29,6 +29,7 @@
     get_quantization_config)
 from vllm.platforms import current_platform
 from vllm.utils import PlaceholderModule
+import vllm.envs as envs

 try:
     from runai_model_streamer import SafetensorsStreamer

@@ -788,3 +789,69 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]:

     # If there were no matches, return the untouched param name
     return name
+
+
+def gaudi_weight_wrapper(weight_loader):
+    """Wrapper for Gaudi weight conversion."""
+
+    FP8_SCALE_FACTOR = 2.0
+
+    def wrapper(*args, **kwargs):
+        # args[0] is parameter, args[1] is loaded_weight
+        # weights will be always in fp8, but scales will be in fp32,
+        # so we can detect it by dtype
+        loaded_weight = args[1]
+        if loaded_weight.dtype == torch.float8_e4m3fn:
+            loaded_weight.data = (loaded_weight.data.float() /
+                                  FP8_SCALE_FACTOR).to(torch.float8_e4m3fn)
+        else:
+            loaded_weight.data = (loaded_weight.data * FP8_SCALE_FACTOR)
+        args = (args[0], loaded_weight) + args[2:]
+        weight_loader(*args, **kwargs)
+
+    return wrapper
+
+
+def with_thread_limits(div_omp: int = 4, div_torch: int = 8):
+    """
+    Decorator to temporarily set OMP_NUM_THREADS and PyTorch threads,
+    and restore them after the function call.
+
+    Args:
+        div_omp: divide CPU cores by this for OMP_NUM_THREADS
+        div_torch: divide CPU cores by this for torch.set_num_threads
+    """
+
+    def decorator(func):
+
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            if not (current_platform.is_hpu()
+                    and envs.VLLM_HPU_CONVERT_TO_FP8UZ):
+                return func(*args, **kwargs)
+
+            # Save original settings
+            old_omp = os.environ.get("OMP_NUM_THREADS", None)
+            old_torch = torch.get_num_threads()
+            num_cores = os.cpu_count() or 1
+
+            # Set new limits
+            os.environ["OMP_NUM_THREADS"] = str(max(1, num_cores // div_omp))
+            torch.set_num_threads(max(1, num_cores // div_torch))
+            logger.warning_once(
+                "Setting OMP_NUM_THREADS to %s and torch.set_num_threads to %s",
+                os.environ["OMP_NUM_THREADS"], torch.get_num_threads())
+            try:
+                # Call the actual function
+                return func(*args, **kwargs)
+            finally:
+                # Restore original settings
+                if old_omp is None:
+                    os.environ.pop("OMP_NUM_THREADS", None)
+                else:
+                    os.environ["OMP_NUM_THREADS"] = old_omp
+                torch.set_num_threads(old_torch)

+        return wrapper
+
+    return decorator
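Two details of `with_thread_limits` worth noting: it is a no-op unless running on HPU with `VLLM_HPU_CONVERT_TO_FP8UZ` set, and the previous `OMP_NUM_THREADS` value (or its absence) plus the torch thread count are restored in a `finally` block, so an exception during loading cannot leave the lowered limits behind. The decorator is not tied to `load_weights`; a hypothetical standalone use (the function name here is illustrative only):

from vllm.model_executor.model_loader.weight_utils import with_thread_limits


@with_thread_limits(div_omp=4, div_torch=8)
def convert_checkpoint_tensors(weights):
    # Heavy CPU-side conversion work; limiting the OMP/torch pools here keeps
    # the host from being oversubscribed while this function runs.
    return {name: tensor.float() for name, tensor in weights}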

vllm/model_executor/models/deepseek_v2.py

Lines changed: 2 additions & 11 deletions
@@ -47,7 +47,7 @@
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, maybe_remap_kv_scale_name)
+    default_weight_loader, maybe_remap_kv_scale_name, with_thread_limits)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors

@@ -781,6 +781,7 @@ def make_empty_intermediate_tensors(
                              device=device),
         })

+    @with_thread_limits()
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [

@@ -796,12 +797,6 @@ def load_weights(self, weights: Iterable[tuple[str,
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
             num_experts=self.config.n_routed_experts)
-        if current_platform.is_hpu():
-            old_num_threads = torch.get_num_threads()
-            import os
-            num_cores = os.cpu_count()
-            os.environ["OMP_NUM_THREADS"] = str(max(1, num_cores // 4))
-            torch.set_num_threads(max(1, num_cores // 8))

         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()

@@ -873,10 +868,6 @@ def load_weights(self, weights: Iterable[tuple[str,
                                         default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
-        if current_platform.is_hpu():
-            # Restore the number of threads for HPU.
-            torch.set_num_threads(old_num_threads)
-            os.environ["OMP_NUM_THREADS"] = str(old_num_threads)
         return loaded_params

vllm/model_executor/models/glm4_moe.py

Lines changed: 9 additions & 8 deletions
@@ -32,7 +32,7 @@

 from vllm.attention import Attention
 from vllm.compilation.decorators import support_torch_compile
-from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config
+from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (get_ep_group, get_pp_group,
                               get_tensor_model_parallel_world_size)
 from vllm.logger import init_logger

@@ -49,7 +49,10 @@
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, maybe_remap_kv_scale_name)
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+    with_thread_limits,
+)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors

@@ -130,11 +133,9 @@ def __init__(
             torch.empty(config.n_routed_experts, dtype=torch.float32))

         # Load balancing settings.
-        vllm_config = get_current_vllm_config()
-        parallel_config = vllm_config.parallel_config
         self.enable_eplb = enable_eplb

-        # Comment code below until we rebase to latset vllm
+        # Comment code below until we rebase to latest vllm
         # self.n_redundant_experts = parallel_config.num_redundant_experts
         self.n_redundant_experts = 0
         self.n_logical_experts = self.n_routed_experts

@@ -161,10 +162,9 @@ def __init__(
             prefix=f"{prefix}.experts",
             scoring_func="sigmoid",
             e_score_correction_bias=self.gate.e_score_correction_bias,
-            # Comment code below until we rebase to latset vllm
+            # Comment code below until we rebase to latest vllm
             # enable_eplb=self.enable_eplb,
             # num_redundant_experts=self.n_redundant_experts
-
         )

         if config.n_shared_experts is not None:

@@ -386,7 +386,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config = vllm_config.model_config.hf_config
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
-        # comment code below until we rebase to latset vllm
+        # comment code below until we rebase to latest vllm
         # enable_eplb = vllm_config.parallel_config.enable_eplb
         enable_eplb = False
         self.config = config

@@ -673,6 +673,7 @@ def compute_logits(
                                        sampling_metadata)
         return logits

+    @with_thread_limits()
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
