4 changes: 4 additions & 0 deletions fastdeploy/model_executor/layers/linear.py
@@ -393,6 +393,7 @@ def __init__(
         with_bias: bool = False,
         add_bias: bool = False,
         skip_quant: bool = False,
+        weight_dtype="",
     ):
         """
         Initializes a linear layer and provides additional parameters required for inference and quantization.
@@ -421,6 +422,7 @@ def __init__(
             with_bias=with_bias,
             add_bias=add_bias,
             skip_quant=skip_quant,
+            weight_dtype=weight_dtype,
         )
 
         assert self.quant_method is not None
@@ -796,6 +798,7 @@ def __init__(
         add_bias: bool = False,
         reduce_results: bool = True,
         skip_quant: bool = False,
+        weight_dtype="",
     ):
         """
         Initialize a linear layer with additional parameters for inference and quantization.
@@ -830,6 +833,7 @@ def __init__(
             with_bias=with_bias,
             add_bias=add_bias,
             skip_quant=skip_quant,
+            weight_dtype=weight_dtype,
         )
         if add_bias:
             assert with_bias, "with_bias must be True when add_bias is True."
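The new weight_dtype parameter lets a caller pin one layer's weights to a dtype other than the model-wide default (for example a float32 scoring head on a bf16 model), while skip_quant keeps that head out of quantization. A minimal sketch of the call pattern, mirroring the qwen2_rm.py usage further down; fd_config is assumed to be an already-constructed FDConfig:

import paddle

from fastdeploy.model_executor.layers.linear import ColumnParallelLinear

# weight_dtype overrides the model default for this layer only (fd_config assumed).
score_proj = ColumnParallelLinear(
    fd_config=fd_config,
    input_size=fd_config.model_config.hidden_size,
    output_size=fd_config.model_config.hidden_size,
    skip_quant=True,
    weight_dtype=paddle.float32,
    with_bias=False,
)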
12 changes: 6 additions & 6 deletions fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py
@@ -548,6 +548,12 @@ def forward(
         return out
 
 
+@ModelRegistry.register_model_class(
+    architecture="Ernie4_5_VLMoeForConditionalGeneration",
+    module_name="ernie4_5_vl.ernie4_5_vl_moe",
+    category=ModelCategory.MULTIMODAL,
+    primary_use=ModelCategory.MULTIMODAL,
+)
 class Ernie4_5_VLMoeForConditionalGeneration(ModelForCasualLM):
     """
     Ernie4_5_VLMoeForConditionalGeneration
@@ -792,12 +798,6 @@ def clear_grpah_opt_backend(self):
         self.ernie.clear_grpah_opt_backend(fd_config=self.fd_config)
 
 
-@ModelRegistry.register_model_class(
-    architecture="Ernie4_5_VLMoeForConditionalGeneration",
-    module_name="ernie4_5_vl.ernie4_5_vl_moe",
-    category=ModelCategory.MULTIMODAL,
-    primary_use=ModelCategory.MULTIMODAL,
-)
 class Ernie4_5_VLPretrainedModel(PretrainedModel):
     """
     Ernie4_5_MoePretrainedModel
16 changes: 13 additions & 3 deletions fastdeploy/model_executor/models/interfaces_base.py
@@ -25,11 +25,11 @@ def is_text_generation_model(model_cls: Type[nn.Layer]) -> bool:
 
 def is_pooling_model(model_cls: Type[nn.Layer]) -> bool:
     class_name = model_cls.__name__
-    pooling_indicators = ["Embedding", "ForSequenceClassification"]
+    pooling_indicators = ["Embedding", "ForSequenceClassification", "Reward"]
     return (
         any(indicator in class_name for indicator in pooling_indicators)
-        or hasattr(model_cls, "is_embedding_model")
-        and model_cls.is_embedding_model
+        or hasattr(model_cls, "is_pooling_model")
+        and model_cls.is_pooling_model
     )
 
 
@@ -45,10 +45,20 @@ def determine_model_category(class_name: str):
         return ModelCategory.MULTIMODAL
     elif any(pattern in class_name for pattern in ["Embedding", "ForSequenceClassification"]):
         return ModelCategory.EMBEDDING
+    elif any(pattern in class_name for pattern in ["Reward"]):
+        return ModelCategory.REWARD
     return ModelCategory.TEXT_GENERATION
 
 
+def get_default_pooling_type(model_cls: Type[nn.Layer] = None) -> str:
+    if model_cls is not None:
+        return getattr(model_cls, "default_pooling_type", "LAST")
+    return "LAST"
+
+
+def default_pooling_type(pooling_type: str):
+    def func(model):
+        model.default_pooling_type = pooling_type  # type: ignore
+        return model
+
+    return func
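The two new helpers form a decorator/reader pair, and is_pooling_model now recognizes reward models both by class name and by flag. A quick sketch of how they compose; DummyRewardModel is hypothetical, for illustration only:

from paddle import nn

from fastdeploy.model_executor.models.interfaces_base import (
    default_pooling_type,
    get_default_pooling_type,
    is_pooling_model,
)


@default_pooling_type("STEP")
class DummyRewardModel(nn.Layer):  # hypothetical class for illustration
    is_pooling_model = True


print(get_default_pooling_type(DummyRewardModel))  # "STEP"
print(get_default_pooling_type())                  # "LAST" fallback
print(is_pooling_model(DummyRewardModel))          # True: "Reward" in the name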
4 changes: 2 additions & 2 deletions fastdeploy/model_executor/models/model_base.py
@@ -39,6 +39,7 @@ class ModelCategory(Enum):
     TEXT_GENERATION = "text_generation"
     MULTIMODAL = "multimodal"
     EMBEDDING = "embedding"
+    REWARD = "reward"
 
 
 @dataclass(frozen=True)
@@ -228,8 +229,7 @@ def register_model_class(
 
     def _register(model_cls):
         # Traditional registration for ModelForCasualLM subclasses
-        if issubclass(model_cls, ModelForCasualLM) and model_cls is not ModelForCasualLM:
-            cls._arch_to_model_cls[model_cls.name()] = model_cls
+        cls._arch_to_model_cls[model_cls.name()] = model_cls
 
         # Enhanced decorator-style registration
         if architecture and module_name:
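Dropping the issubclass guard is what lets the nn.Layer-based reward model below be indexed by architecture name: any decorated class exposing name() now lands in _arch_to_model_cls. A toy standalone sketch of the relaxed rule; class names here are hypothetical:

# Toy registry: any class with a name() classmethod is indexed,
# not only ModelForCasualLM subclasses.
_arch_to_model_cls = {}


def register(model_cls):
    _arch_to_model_cls[model_cls.name()] = model_cls
    return model_cls


@register
class NotACausalLM:  # stands in for a pooling/reward model
    @classmethod
    def name(cls):
        return "NotACausalLM"


assert _arch_to_model_cls["NotACausalLM"] is NotACausalLM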
27 changes: 18 additions & 9 deletions fastdeploy/model_executor/models/qwen2.py
@@ -44,6 +44,12 @@
     ModelForCasualLM,
     ModelRegistry,
 )
+from fastdeploy.model_executor.utils import (
+    WeightsMapper,
+    default_weight_loader,
+    process_weights_after_loading,
+    process_weights_before_loading,
+)
 
 
 class Qwen2MLP(nn.Layer):
@@ -316,6 +322,15 @@ def __init__(self, fd_config: FDConfig):
             prefix="lm_head",
         )
 
+        self.process_weights_before_loading_fn = process_weights_before_loading(
+            skip_prefixes=["lm_head"],
+            mapper=(
+                WeightsMapper(orig_to_new_prefix={"model.": "qwen2."})
+                if self.fd_config.model_config.model_format == "torch"
+                else None
+            ),
+        )
+
     @paddle.no_grad()
     def load_weights(self, weights_iterator) -> None:
         """
@@ -325,11 +340,6 @@ def load_weights(self, weights_iterator) -> None:
             weights_iterator (Iterator): An iterator yielding (name, weight) pairs.
         """
 
-        from fastdeploy.model_executor.utils import (
-            default_weight_loader,
-            process_weights_after_loading,
-        )
-
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -344,10 +354,9 @@ def load_weights(self, weights_iterator) -> None:
         params_dict = dict(self.named_parameters())
         process_weights_after_loading_fn = process_weights_after_loading(dict(self.named_sublayers()))
         for loaded_weight_name, loaded_weight in weights_iterator:
-            model_format = self.fd_config.model_config.model_format
-            # Because the prefix for Paddle is qwen2, and for Hugging Face it is model.
-            if model_format == "torch":
-                loaded_weight_name = loaded_weight_name.replace("model", "qwen2")
+            loaded_weight_name = self.process_weights_before_loading_fn(loaded_weight_name)
+            if loaded_weight_name is None:
+                continue
             for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in loaded_weight_name:
                     continue
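The hook built in __init__ replaces the inline renaming that used to live in the load loop: names are prefix-mapped first (torch checkpoints use "model.", Paddle uses "qwen2."), then lm_head weights are dropped. A standalone sketch of the behavior for the torch-format case; the weight names are hypothetical:

from fastdeploy.model_executor.utils import (
    WeightsMapper,
    process_weights_before_loading,
)

fn = process_weights_before_loading(
    skip_prefixes=["lm_head"],
    mapper=WeightsMapper(orig_to_new_prefix={"model.": "qwen2."}),
)

print(fn("model.layers.0.mlp.up_proj.weight"))  # qwen2.layers.0.mlp.up_proj.weight
print(fn("lm_head.weight"))                     # None -> the load loop skips it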
109 changes: 109 additions & 0 deletions fastdeploy/model_executor/models/qwen2_rm.py
@@ -0,0 +1,109 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

from __future__ import annotations

import paddle
from paddle import nn

from fastdeploy.config import FDConfig
from fastdeploy.model_executor.forward_meta import ForwardMeta
from fastdeploy.model_executor.layers.linear import (
ColumnParallelLinear,
RowParallelLinear,
)
from fastdeploy.model_executor.layers.pooler import DispatchPooler, Pooler
from fastdeploy.model_executor.utils import process_weights_before_loading

from .interfaces_base import default_pooling_type
from .model_base import ModelCategory, ModelRegistry
from .qwen2 import Qwen2ForCausalLM, Qwen2Model


class Qwen2RewardBaseModel(nn.Layer):
"""
Qwen2RewardBaseModel
"""

is_pooling_model = True
pooler: Pooler

def __init__(self, fd_config: FDConfig):
super().__init__()
self.model = Qwen2Model(fd_config=fd_config)
self.head_dtype = paddle.float32

self.score = nn.Sequential(
ColumnParallelLinear(
fd_config=fd_config,
input_size=fd_config.model_config.hidden_size,
output_size=fd_config.model_config.hidden_size,
skip_quant=True,
weight_dtype=self.head_dtype,
with_bias=False,
),
nn.ReLU(),
RowParallelLinear(
fd_config=fd_config,
input_size=fd_config.model_config.hidden_size,
output_size=fd_config.model_config.num_labels,
skip_quant=True,
weight_dtype=self.head_dtype,
with_bias=False,
),
)

def forward(
self,
ids_remove_padding: paddle.Tensor,
forward_meta: ForwardMeta,
):
hidden_states = self.model(ids_remove_padding=ids_remove_padding, forward_meta=forward_meta)
hidden_states = hidden_states.to(self.head_dtype)
logits = self.score(hidden_states)
return logits


@ModelRegistry.register_model_class(
architecture="Qwen2ForProcessRewardModel",
module_name="qwen2_rm",
category=[ModelCategory.REWARD],
primary_use=ModelCategory.REWARD,
)
@default_pooling_type("STEP")
class Qwen2ForProcessRewardModel(Qwen2RewardBaseModel):

def __init__(self, fd_config: FDConfig):
self.fd_config = fd_config
fd_config.model_config.num_labels = 2
super().__init__(fd_config=fd_config)

pooler_config = fd_config.model_config.pooler_config
assert pooler_config is not None

self.pooler = DispatchPooler({"encode": Pooler.for_encode(pooler_config)})

self.process_weights_before_loading_fn = process_weights_before_loading(skip_prefixes=["lm_head"])

@classmethod
def name(self):
""" """
return "Qwen2ForProcessRewardModel"

@paddle.no_grad()
def load_weights(self, weights_iterator):
# Filter out lm_head weights of Qwen2ForCausalLM
Qwen2ForCausalLM.load_weights(self, weights_iterator)
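For intuition, a minimal single-process sketch of what the score head computes, with plain paddle.nn.Linear standing in for the tensor-parallel layers and hypothetical shapes (7 packed tokens, hidden_size 1024, num_labels 2):

import paddle

hidden_states = paddle.randn([7, 1024])  # stand-in for Qwen2Model output
head = paddle.nn.Sequential(
    paddle.nn.Linear(1024, 1024, bias_attr=False),
    paddle.nn.ReLU(),
    paddle.nn.Linear(1024, 2, bias_attr=False),
)
# The cast mirrors hidden_states.to(self.head_dtype) above: the head runs in float32.
logits = head(hidden_states.astype("float32"))
print(logits.shape)  # [7, 2]: per-token reward logits, pooled later by STEP pooling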
34 changes: 33 additions & 1 deletion fastdeploy/model_executor/utils.py
@@ -15,8 +15,10 @@
 """
 
 import re
+from collections.abc import Mapping
 from contextlib import contextmanager
-from typing import Any, Optional, Union
+from dataclasses import dataclass, field
+from typing import Any, List, Optional, Union
 
 import paddle
 
@@ -146,6 +148,36 @@ def fn(model_sublayer_name: str, param=None):
     return fn
 
 
+@dataclass
+class WeightsMapper:
+    orig_to_new_prefix: Mapping[str, Optional[str]] = field(default_factory=dict)
+
+    def _map_name(self, key: str) -> Optional[str]:
+        for prefix, new_key in self.orig_to_new_prefix.items():
+            if key.startswith(prefix):
+                key = key.replace(prefix, new_key, 1)
+        return key
+
+    def apply(self, weight_name):
+        return self._map_name(weight_name)
+
+
+def process_weights_before_loading(
+    *, skip_prefixes: Optional[List[str]] = None, mapper: Optional[WeightsMapper] = None
+):
+    def _can_skip(weight_name):
+        return any(weight_name.startswith(p) for p in skip_prefixes)
+
+    def fn(weight_name):
+        if mapper is not None:
+            weight_name = mapper.apply(weight_name)
+        if _can_skip(weight_name):
+            weight_name = None
+        return weight_name
+
+    return fn
+
+
 def free_tensor(tensor):
     if hasattr(tensor, "tensor_track"):
         tensor.tensor_track = None
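WeightsMapper applies each configured prefix rewrite at most once per name, so it generalizes the single "model." to "qwen2." case; prefixes that don't match leave the name untouched. A small usage sketch (the second mapping is hypothetical):

from fastdeploy.model_executor.utils import WeightsMapper

mapper = WeightsMapper(
    orig_to_new_prefix={"model.": "qwen2.", "visual.": "vision_model."}
)
print(mapper.apply("model.layers.0.norm.weight"))  # qwen2.layers.0.norm.weight
print(mapper.apply("visual.patch_embed.weight"))   # vision_model.patch_embed.weight
print(mapper.apply("lm_head.weight"))              # unchanged: no prefix matches

Note that process_weights_before_loading assumes skip_prefixes is provided: the None default would raise inside _can_skip, which is why both call sites in this PR (qwen2.py and qwen2_rm.py) pass an explicit list.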