diff --git a/fastdeploy/model_executor/layers/linear.py b/fastdeploy/model_executor/layers/linear.py index 83c84454f9..4f7dc90328 100644 --- a/fastdeploy/model_executor/layers/linear.py +++ b/fastdeploy/model_executor/layers/linear.py @@ -393,6 +393,7 @@ def __init__( with_bias: bool = False, add_bias: bool = False, skip_quant: bool = False, + weight_dtype="", ): """ Initializes a linear layer and provides additional parameters required for inference and quantization. @@ -421,6 +422,7 @@ def __init__( with_bias=with_bias, add_bias=add_bias, skip_quant=skip_quant, + weight_dtype=weight_dtype, ) assert self.quant_method is not None @@ -796,6 +798,7 @@ def __init__( add_bias: bool = False, reduce_results: bool = True, skip_quant: bool = False, + weight_dtype="", ): """ Initialize a linear layer with additional parameters for inference and quantization. @@ -830,6 +833,7 @@ def __init__( with_bias=with_bias, add_bias=add_bias, skip_quant=skip_quant, + weight_dtype=weight_dtype, ) if add_bias: assert with_bias, "with_bias must be True when add_bias is True." 
def is_pooling_model(model_cls: Type["nn.Layer"]) -> bool:
    """Tell whether ``model_cls`` should be treated as a pooling model.

    A class qualifies when either of the following holds:

    * its ``__name__`` contains one of the substrings ``"Embedding"``,
      ``"ForSequenceClassification"`` or ``"Reward"``;
    * it exposes a truthy ``is_pooling_model`` class attribute.

    Args:
        model_cls: The model class (not an instance) to inspect.

    Returns:
        ``True`` or ``False``. The attribute value is coerced with ``bool`` so
        the declared return type holds even if a class stores a non-boolean
        flag value.
    """
    pooling_indicators = ("Embedding", "ForSequenceClassification", "Reward")
    class_name = model_cls.__name__

    # Name-based detection comes first and short-circuits the attribute lookup.
    if any(indicator in class_name for indicator in pooling_indicators):
        return True

    # Explicit opt-in via a class attribute; a missing attribute means "no".
    return bool(getattr(model_cls, "is_pooling_model", False))
def get_default_pooling_type(model_cls: Type["nn.Layer"] = None) -> str:
    """Return the pooling type declared on ``model_cls``.

    Args:
        model_cls: A model class that may carry a ``default_pooling_type``
            attribute. May be ``None``.

    Returns:
        The class's ``default_pooling_type`` attribute when present; otherwise
        (including when ``model_cls`` is ``None``) the string ``"LAST"``.
    """
    fallback = "LAST"
    if model_cls is None:
        return fallback
    return getattr(model_cls, "default_pooling_type", fallback)


def default_pooling_type(pooling_type: str):
    """Build a class decorator that records ``pooling_type`` on the class.

    The returned decorator sets ``default_pooling_type`` on the decorated
    object and hands the same object back.

    Args:
        pooling_type: The value to store as ``default_pooling_type``.

    Returns:
        A one-argument decorator.
    """

    def _attach(model_cls):
        setattr(model_cls, "default_pooling_type", pooling_type)
        return model_cls

    return _attach
prefix="lm_head", ) + self.process_weights_before_loading_fn = process_weights_before_loading( + skip_prefixes=["lm_head"], + mapper=( + WeightsMapper(orig_to_new_prefix={"model.": "qwen2."}) + if self.fd_config.model_config.model_format == "torch" + else None + ), + ) + @paddle.no_grad() def load_weights(self, weights_iterator) -> None: """ @@ -325,11 +340,6 @@ def load_weights(self, weights_iterator) -> None: weights_iterator (Iterator): An iterator yielding (name, weight) pairs. """ - from fastdeploy.model_executor.utils import ( - default_weight_loader, - process_weights_after_loading, - ) - stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -344,10 +354,9 @@ def load_weights(self, weights_iterator) -> None: params_dict = dict(self.named_parameters()) process_weights_after_loading_fn = process_weights_after_loading(dict(self.named_sublayers())) for loaded_weight_name, loaded_weight in weights_iterator: - model_format = self.fd_config.model_config.model_format - # Because the prefix for Paddle is qwen2, and for Hugging Face it is model. - if model_format == "torch": - loaded_weight_name = loaded_weight_name.replace("model", "qwen2") + loaded_weight_name = self.process_weights_before_loading_fn(loaded_weight_name) + if loaded_weight_name is None: + continue for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in loaded_weight_name: continue diff --git a/fastdeploy/model_executor/models/qwen2_rm.py b/fastdeploy/model_executor/models/qwen2_rm.py new file mode 100644 index 0000000000..4f8f0dacb2 --- /dev/null +++ b/fastdeploy/model_executor/models/qwen2_rm.py @@ -0,0 +1,109 @@ +""" +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +from __future__ import annotations + +import paddle +from paddle import nn + +from fastdeploy.config import FDConfig +from fastdeploy.model_executor.forward_meta import ForwardMeta +from fastdeploy.model_executor.layers.linear import ( + ColumnParallelLinear, + RowParallelLinear, +) +from fastdeploy.model_executor.layers.pooler import DispatchPooler, Pooler +from fastdeploy.model_executor.utils import process_weights_before_loading + +from .interfaces_base import default_pooling_type +from .model_base import ModelCategory, ModelRegistry +from .qwen2 import Qwen2ForCausalLM, Qwen2Model + + +class Qwen2RewardBaseModel(nn.Layer): + """ + Qwen2RewardBaseModel + """ + + is_pooling_model = True + pooler: Pooler + + def __init__(self, fd_config: FDConfig): + super().__init__() + self.model = Qwen2Model(fd_config=fd_config) + self.head_dtype = paddle.float32 + + self.score = nn.Sequential( + ColumnParallelLinear( + fd_config=fd_config, + input_size=fd_config.model_config.hidden_size, + output_size=fd_config.model_config.hidden_size, + skip_quant=True, + weight_dtype=self.head_dtype, + with_bias=False, + ), + nn.ReLU(), + RowParallelLinear( + fd_config=fd_config, + input_size=fd_config.model_config.hidden_size, + output_size=fd_config.model_config.num_labels, + skip_quant=True, + weight_dtype=self.head_dtype, + with_bias=False, + ), + ) + + def forward( + self, + ids_remove_padding: paddle.Tensor, + forward_meta: ForwardMeta, + ): + hidden_states = self.model(ids_remove_padding=ids_remove_padding, forward_meta=forward_meta) + hidden_states = 
@dataclass
class WeightsMapper:
    """Rewrite checkpoint weight names by prefix.

    ``orig_to_new_prefix`` maps an original name prefix to its replacement.
    A replacement of ``None`` marks every weight under that prefix as one to
    be dropped.
    """

    # prefix in the checkpoint -> prefix expected by the model (None = drop)
    orig_to_new_prefix: Mapping[str, Optional[str]] = field(default_factory=dict)

    def _map_name(self, key: str) -> Optional[str]:
        """Apply every matching prefix rule to ``key`` in mapping order.

        Args:
            key: The weight name as found in the checkpoint.

        Returns:
            The rewritten name, the unchanged name when no rule matches, or
            ``None`` when a matching rule maps to ``None``.
        """
        for prefix, new_key in self.orig_to_new_prefix.items():
            if not key.startswith(prefix):
                continue
            if new_key is None:
                # The annotation allows None targets; passing None to
                # str.replace would raise TypeError, so treat it as "drop".
                return None
            # count=1 together with startswith() limits the rewrite to the
            # leading prefix, not later occurrences of the same text.
            key = key.replace(prefix, new_key, 1)
        return key

    def apply(self, weight_name):
        """Return the mapped form of ``weight_name`` (see ``_map_name``)."""
        return self._map_name(weight_name)


def process_weights_before_loading(
    *, skip_prefixes: Optional[List[str]] = None, mapper: Optional[WeightsMapper] = None
):
    """Build a callable that renames and filters weight names before loading.

    Args:
        skip_prefixes: Name prefixes whose weights must be skipped. ``None``
            (the default) or an empty list skips nothing.
        mapper: Optional ``WeightsMapper`` applied before the skip check, so
            ``skip_prefixes`` are matched against the mapped name.

    Returns:
        A function taking a weight name and returning the name to load under,
        or ``None`` when the weight must be skipped.
    """
    # Snapshot as a tuple: handles the None default and lets str.startswith
    # test all prefixes in one call.
    prefixes_to_skip = tuple(skip_prefixes or ())

    def _can_skip(weight_name):
        return weight_name.startswith(prefixes_to_skip)

    def fn(weight_name) -> Optional[str]:
        if mapper is not None:
            weight_name = mapper.apply(weight_name)
            if weight_name is None:
                # The mapper already dropped this weight.
                return None
        if _can_skip(weight_name):
            return None
        return weight_name

    return fn