4 changes: 4 additions & 0 deletions fastdeploy/model_executor/layers/linear.py
@@ -393,6 +393,7 @@ def __init__(
         with_bias: bool = False,
         add_bias: bool = False,
         skip_quant: bool = False,
+        weight_dtype="",
     ):
         """
         Initializes a linear layer and provides additional parameters required for inference and quantization.
@@ -421,6 +422,7 @@ def __init__(
             with_bias=with_bias,
             add_bias=add_bias,
             skip_quant=skip_quant,
+            weight_dtype=weight_dtype,
         )
 
         assert self.quant_method is not None
@@ -796,6 +798,7 @@ def __init__(
         add_bias: bool = False,
         reduce_results: bool = True,
         skip_quant: bool = False,
+        weight_dtype="",
     ):
         """
         Initialize a linear layer with additional parameters for inference and quantization.
@@ -830,6 +833,7 @@ def __init__(
             with_bias=with_bias,
             add_bias=add_bias,
             skip_quant=skip_quant,
+            weight_dtype=weight_dtype,
         )
         if add_bias:
             assert with_bias, "with_bias must be True when add_bias is True."
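The new weight_dtype parameter lets a caller pin one layer's weights to a dtype other than the model-wide default (for example a float32 scoring head on a bf16 model), while skip_quant keeps that head out of quantization. A minimal sketch of the call pattern, mirroring the qwen2_rm.py usage further down; fd_config is assumed to be an already-constructed FDConfig:

import paddle

from fastdeploy.model_executor.layers.linear import ColumnParallelLinear

# weight_dtype overrides the model default for this layer only (fd_config assumed).
score_proj = ColumnParallelLinear(
    fd_config=fd_config,
    input_size=fd_config.model_config.hidden_size,
    output_size=fd_config.model_config.hidden_size,
    skip_quant=True,
    weight_dtype=paddle.float32,
    with_bias=False,
)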
12 changes: 6 additions & 6 deletions fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py
@@ -548,6 +548,12 @@ def forward(
         return out
 
 
+@ModelRegistry.register_model_class(
+    architecture="Ernie4_5_VLMoeForConditionalGeneration",
+    module_name="ernie4_5_vl.ernie4_5_vl_moe",
+    category=ModelCategory.MULTIMODAL,
+    primary_use=ModelCategory.MULTIMODAL,
+)
 class Ernie4_5_VLMoeForConditionalGeneration(ModelForCasualLM):
     """
     Ernie4_5_VLMoeForConditionalGeneration
@@ -792,12 +798,6 @@ def clear_grpah_opt_backend(self):
         self.ernie.clear_grpah_opt_backend(fd_config=self.fd_config)
 
 
-@ModelRegistry.register_model_class(
-    architecture="Ernie4_5_VLMoeForConditionalGeneration",
-    module_name="ernie4_5_vl.ernie4_5_vl_moe",
-    category=ModelCategory.MULTIMODAL,
-    primary_use=ModelCategory.MULTIMODAL,
-)
 class Ernie4_5_VLPretrainedModel(PretrainedModel):
     """
     Ernie4_5_MoePretrainedModel
16 changes: 13 additions & 3 deletions fastdeploy/model_executor/models/interfaces_base.py
@@ -25,11 +25,11 @@ def is_text_generation_model(model_cls: Type[nn.Layer]) -> bool:
 
 def is_pooling_model(model_cls: Type[nn.Layer]) -> bool:
     class_name = model_cls.__name__
-    pooling_indicators = ["Embedding", "ForSequenceClassification"]
+    pooling_indicators = ["Embedding", "ForSequenceClassification", "Reward"]
     return (
         any(indicator in class_name for indicator in pooling_indicators)
-        or hasattr(model_cls, "is_embedding_model")
-        and model_cls.is_embedding_model
+        or hasattr(model_cls, "is_pooling_model")
+        and model_cls.is_pooling_model
     )
 
 
@@ -45,10 +45,20 @@ def determine_model_category(class_name: str):
         return ModelCategory.MULTIMODAL
     elif any(pattern in class_name for pattern in ["Embedding", "ForSequenceClassification"]):
         return ModelCategory.EMBEDDING
+    elif any(pattern in class_name for pattern in ["Reward"]):
+        return ModelCategory.REWARD
     return ModelCategory.TEXT_GENERATION
 
 
+def get_default_pooling_type(model_cls: Type[nn.Layer] = None) -> str:
+    if model_cls is not None:
+        return getattr(model_cls, "default_pooling_type", "LAST")
+    return "LAST"
+
+
+def default_pooling_type(pooling_type: str):
+    def func(model):
+        model.default_pooling_type = pooling_type  # type: ignore
+        return model
+
+    return func
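The two new helpers form a decorator/reader pair, and is_pooling_model now recognizes reward models both by class name and by flag. A quick sketch of how they compose; DummyRewardModel is hypothetical, for illustration only:

from paddle import nn

from fastdeploy.model_executor.models.interfaces_base import (
    default_pooling_type,
    get_default_pooling_type,
    is_pooling_model,
)


@default_pooling_type("STEP")
class DummyRewardModel(nn.Layer):  # hypothetical class for illustration
    is_pooling_model = True


print(get_default_pooling_type(DummyRewardModel))  # "STEP"
print(get_default_pooling_type())                  # "LAST" fallback
print(is_pooling_model(DummyRewardModel))          # True: "Reward" in the name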
4 changes: 2 additions & 2 deletions fastdeploy/model_executor/models/model_base.py
@@ -39,6 +39,7 @@ class ModelCategory(Enum):
     TEXT_GENERATION = "text_generation"
     MULTIMODAL = "multimodal"
     EMBEDDING = "embedding"
+    REWARD = "reward"
 
 
 @dataclass(frozen=True)
@@ -228,8 +229,7 @@ def register_model_class(
 
     def _register(model_cls):
         # Traditional registration for ModelForCasualLM subclasses
-        if issubclass(model_cls, ModelForCasualLM) and model_cls is not ModelForCasualLM:
-            cls._arch_to_model_cls[model_cls.name()] = model_cls
+        cls._arch_to_model_cls[model_cls.name()] = model_cls
 
         # Enhanced decorator-style registration
         if architecture and module_name:
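Dropping the issubclass guard is what lets the nn.Layer-based reward model below be indexed by architecture name: any decorated class exposing name() now lands in _arch_to_model_cls. A toy standalone sketch of the relaxed rule; class names here are hypothetical:

# Toy registry: any class with a name() classmethod is indexed,
# not only ModelForCasualLM subclasses.
_arch_to_model_cls = {}


def register(model_cls):
    _arch_to_model_cls[model_cls.name()] = model_cls
    return model_cls


@register
class NotACausalLM:  # stands in for a pooling/reward model
    @classmethod
    def name(cls):
        return "NotACausalLM"


assert _arch_to_model_cls["NotACausalLM"] is NotACausalLM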
27 changes: 18 additions & 9 deletions fastdeploy/model_executor/models/qwen2.py
@@ -44,6 +44,12 @@
     ModelForCasualLM,
     ModelRegistry,
 )
+from fastdeploy.model_executor.utils import (
+    WeightsMapper,
+    default_weight_loader,
+    process_weights_after_loading,
+    process_weights_before_loading,
+)
 
 
 class Qwen2MLP(nn.Layer):
@@ -316,6 +322,15 @@ def __init__(self, fd_config: FDConfig):
             prefix="lm_head",
         )
 
+        self.process_weights_before_loading_fn = process_weights_before_loading(
+            skip_prefixes=["lm_head"],
+            mapper=(
+                WeightsMapper(orig_to_new_prefix={"model.": "qwen2."})
+                if self.fd_config.model_config.model_format == "torch"
+                else None
+            ),
+        )
+
     @paddle.no_grad()
     def load_weights(self, weights_iterator) -> None:
         """
@@ -325,11 +340,6 @@ def load_weights(self, weights_iterator) -> None:
             weights_iterator (Iterator): An iterator yielding (name, weight) pairs.
         """
 
-        from fastdeploy.model_executor.utils import (
-            default_weight_loader,
-            process_weights_after_loading,
-        )
-
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -344,10 +354,9 @@ def load_weights(self, weights_iterator) -> None:
         params_dict = dict(self.named_parameters())
         process_weights_after_loading_fn = process_weights_after_loading(dict(self.named_sublayers()))
         for loaded_weight_name, loaded_weight in weights_iterator:
-            model_format = self.fd_config.model_config.model_format
-            # Because the prefix for Paddle is qwen2, and for Hugging Face it is model.
-            if model_format == "torch":
-                loaded_weight_name = loaded_weight_name.replace("model", "qwen2")
+            loaded_weight_name = self.process_weights_before_loading_fn(loaded_weight_name)
+            if loaded_weight_name is None:
+                continue
             for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in loaded_weight_name:
                     continue
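The hook built in __init__ replaces the inline renaming that used to live in the load loop: names are prefix-mapped first (torch checkpoints use "model.", Paddle uses "qwen2."), then lm_head weights are dropped. A standalone sketch of the behavior for the torch-format case; the weight names are hypothetical:

from fastdeploy.model_executor.utils import (
    WeightsMapper,
    process_weights_before_loading,
)

fn = process_weights_before_loading(
    skip_prefixes=["lm_head"],
    mapper=WeightsMapper(orig_to_new_prefix={"model.": "qwen2."}),
)

print(fn("model.layers.0.mlp.up_proj.weight"))  # qwen2.layers.0.mlp.up_proj.weight
print(fn("lm_head.weight"))                     # None -> the load loop skips it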
109 changes: 109 additions & 0 deletions fastdeploy/model_executor/models/qwen2_rm.py
@@ -0,0 +1,109 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

from __future__ import annotations

import paddle
from paddle import nn

from fastdeploy.config import FDConfig
from fastdeploy.model_executor.forward_meta import ForwardMeta
from fastdeploy.model_executor.layers.linear import (
ColumnParallelLinear,
RowParallelLinear,
)
from fastdeploy.model_executor.layers.pooler import DispatchPooler, Pooler
from fastdeploy.model_executor.utils import process_weights_before_loading

from .interfaces_base import default_pooling_type
from .model_base import ModelCategory, ModelRegistry
from .qwen2 import Qwen2ForCausalLM, Qwen2Model


class Qwen2RewardBaseModel(nn.Layer):
"""
Qwen2RewardBaseModel
"""

is_pooling_model = True
pooler: Pooler

def __init__(self, fd_config: FDConfig):
super().__init__()
self.model = Qwen2Model(fd_config=fd_config)
self.head_dtype = paddle.float32

self.score = nn.Sequential(
ColumnParallelLinear(
fd_config=fd_config,
input_size=fd_config.model_config.hidden_size,
output_size=fd_config.model_config.hidden_size,
skip_quant=True,
weight_dtype=self.head_dtype,
with_bias=False,
),
nn.ReLU(),
RowParallelLinear(
fd_config=fd_config,
input_size=fd_config.model_config.hidden_size,
output_size=fd_config.model_config.num_labels,
skip_quant=True,
weight_dtype=self.head_dtype,
with_bias=False,
),
)

def forward(
self,
ids_remove_padding: paddle.Tensor,
forward_meta: ForwardMeta,
):
hidden_states = self.model(ids_remove_padding=ids_remove_padding, forward_meta=forward_meta)
hidden_states = hidden_states.to(self.head_dtype)
logits = self.score(hidden_states)
return logits


@ModelRegistry.register_model_class(
architecture="Qwen2ForProcessRewardModel",
module_name="qwen2_rm",
category=[ModelCategory.REWARD],
primary_use=ModelCategory.REWARD,
)
@default_pooling_type("STEP")
class Qwen2ForProcessRewardModel(Qwen2RewardBaseModel):

def __init__(self, fd_config: FDConfig):
self.fd_config = fd_config
fd_config.model_config.num_labels = 2
super().__init__(fd_config=fd_config)

pooler_config = fd_config.model_config.pooler_config
assert pooler_config is not None

self.pooler = DispatchPooler({"encode": Pooler.for_encode(pooler_config)})

self.process_weights_before_loading_fn = process_weights_before_loading(skip_prefixes=["lm_head"])

@classmethod
def name(self):
""" """
return "Qwen2ForProcessRewardModel"

@paddle.no_grad()
def load_weights(self, weights_iterator):
# Filter out lm_head weights of Qwen2ForCausalLM
Qwen2ForCausalLM.load_weights(self, weights_iterator)
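For intuition, a minimal single-process sketch of what the score head computes, with plain paddle.nn.Linear standing in for the tensor-parallel layers and hypothetical shapes (7 packed tokens, hidden_size 1024, num_labels 2):

import paddle

hidden_states = paddle.randn([7, 1024])  # stand-in for Qwen2Model output
head = paddle.nn.Sequential(
    paddle.nn.Linear(1024, 1024, bias_attr=False),
    paddle.nn.ReLU(),
    paddle.nn.Linear(1024, 2, bias_attr=False),
)
# The cast mirrors hidden_states.to(self.head_dtype) above: the head runs in float32.
logits = head(hidden_states.astype("float32"))
print(logits.shape)  # [7, 2]: per-token reward logits, pooled later by STEP pooling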
34 changes: 33 additions & 1 deletion fastdeploy/model_executor/utils.py
@@ -15,8 +15,10 @@
 """
 
 import re
+from collections.abc import Mapping
 from contextlib import contextmanager
-from typing import Any, Optional, Union
+from dataclasses import dataclass, field
+from typing import Any, List, Optional, Union
 
 import paddle
 
@@ -146,6 +148,36 @@ def fn(model_sublayer_name: str, param=None):
     return fn
 
 
+@dataclass
+class WeightsMapper:
+    orig_to_new_prefix: Mapping[str, Optional[str]] = field(default_factory=dict)
+
+    def _map_name(self, key: str) -> Optional[str]:
+        for prefix, new_key in self.orig_to_new_prefix.items():
+            if key.startswith(prefix):
+                key = key.replace(prefix, new_key, 1)
+        return key
+
+    def apply(self, weight_name):
+        return self._map_name(weight_name)
+
+
+def process_weights_before_loading(
+    *, skip_prefixes: Optional[List[str]] = None, mapper: Optional[WeightsMapper] = None
+):
+    def _can_skip(weight_name):
+        return any(weight_name.startswith(p) for p in skip_prefixes)
+
+    def fn(weight_name):
+        if mapper is not None:
+            weight_name = mapper.apply(weight_name)
+        if _can_skip(weight_name):
+            weight_name = None
+        return weight_name
+
+    return fn
+
+
 def free_tensor(tensor):
     if hasattr(tensor, "tensor_track"):
         tensor.tensor_track = None
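WeightsMapper applies each configured prefix rewrite at most once per name, so it generalizes the single "model." to "qwen2." case; prefixes that don't match leave the name untouched. A small usage sketch (the second mapping is hypothetical):

from fastdeploy.model_executor.utils import WeightsMapper

mapper = WeightsMapper(
    orig_to_new_prefix={"model.": "qwen2.", "visual.": "vision_model."}
)
print(mapper.apply("model.layers.0.norm.weight"))  # qwen2.layers.0.norm.weight
print(mapper.apply("visual.patch_embed.weight"))   # vision_model.patch_embed.weight
print(mapper.apply("lm_head.weight"))              # unchanged: no prefix matches

Note that process_weights_before_loading assumes skip_prefixes is provided: the None default would raise inside _can_skip, which is why both call sites in this PR (qwen2.py and qwen2_rm.py) pass an explicit list.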