Commit 20839ab

qwen3_moe (#3084)

1 parent 91dc87f

30 files changed: +1361 -1087 lines

fastdeploy/config.py

Lines changed: 1 addition & 1 deletion
@@ -663,7 +663,7 @@ class LoadChoices(str, Enum):
 
     DEFAULT = "default"
     # only support qwen3-bf16 now
-    NEW_LOADER = "new_loader"
+    DEFAULT_V1 = "default_v1"
 
 
 class LoadConfig:
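
The v1 loader now registers under the name "default_v1" instead of "new_loader", and the comment above it still scopes it to qwen3 in bf16. For reference, a minimal sketch of the enum as it stands after this change (only the members visible in the diff are shown):

from enum import Enum

class LoadChoices(str, Enum):
    # Sketch limited to the members the diff shows.
    DEFAULT = "default"
    # only support qwen3-bf16 now
    DEFAULT_V1 = "default_v1"

# Because the enum derives from str, config values round-trip from plain strings:
assert LoadChoices("default_v1") is LoadChoices.DEFAULT_V1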

fastdeploy/model_executor/layers/backends/gcu/moe/fused_moe_method_gcu_backend.py

Lines changed: 14 additions & 26 deletions
@@ -22,7 +22,9 @@
 from paddle import nn
 from paddleformers.utils.log import logger
 
-from fastdeploy.model_executor.layers.moe.fused_moe_backend_base import MoEMethodBase
+from fastdeploy.model_executor.layers.moe.fused_moe_backend_base import (
+    UnquantizedFusedMoEMethod,
+)
 from fastdeploy.model_executor.layers.utils import (
     CpuGuard,
     create_and_set_parameter,
@@ -37,7 +39,7 @@
 )
 
 
-class GCUFusedMoeMethod(MoEMethodBase):
+class GCUFusedMoeMethod(UnquantizedFusedMoEMethod):
     """
     Use GCU to compute Fused MoE.
     """
@@ -46,28 +48,12 @@ def __init__(self, quant_config):
         super().__init__(quant_config)
         self.group_size = -1
 
-    def create_weights(self, layer: nn.Layer, state_dict):
-        """
-        Paddle gcu create weight process.
-        """
-        # bf16
+    def process_loaded_weights(self, layer: nn.Layer, state_dict):
         up_gate_proj_weights, down_proj_weights = layer.extract_moe_ffn_weights(state_dict)
         stacked_up_gate_proj_weights = paddle.stack(up_gate_proj_weights, axis=0)
         stacked_down_proj_weights = paddle.stack(down_proj_weights, axis=0)
-        for idx, weight_tensor in enumerate([stacked_up_gate_proj_weights, stacked_down_proj_weights]):
-            # shape [E, K, N] -> [E, N, K]
-            weight_tensor = paddle.transpose(weight_tensor, [0, 2, 1])
-            weight_name = self.added_weight_attrs[idx]
-            setattr(
-                layer,
-                weight_name,
-                layer.create_parameter(
-                    shape=weight_tensor.shape,
-                    dtype=weight_tensor.dtype,
-                    default_initializer=paddle.nn.initializer.Constant(0),
-                ),
-            )
-            getattr(layer, weight_name).set_value(weight_tensor)
+        layer.up_gate_proj_weight.set_value(paddle.transpose(stacked_up_gate_proj_weights, [0, 2, 1]))
+        layer.down_proj_weight.set_value(paddle.transpose(stacked_down_proj_weights, [0, 2, 1]))
 
     @paddle.no_grad()
     def compute_ffn(
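
With parameter creation moved into the shared UnquantizedFusedMoEMethod base, the GCU hook only reshapes and copies: per-expert [K, N] matrices are stacked to [E, K, N], transposed to [E, N, K], and written into the pre-created parameters via set_value. A minimal standalone sketch of that layout change, with illustrative sizes:

import paddle

num_experts, k, n = 4, 8, 16  # illustrative sizes, not the real model's
per_expert = [paddle.randn([k, n]) for _ in range(num_experts)]

stacked = paddle.stack(per_expert, axis=0)         # [E, K, N]
transposed = paddle.transpose(stacked, [0, 2, 1])  # [E, N, K]
assert transposed.shape == [num_experts, n, k]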
@@ -202,18 +188,19 @@ def apply(
         self,
         layer: nn.Layer,
         x: paddle.Tensor,
-        gate_out: paddle.Tensor,
+        gate: nn.Layer,
     ) -> paddle.Tensor:
         """
         Paddle gcu compute Fused MoE.
         """
+        gate_out = gate(x.cast("float32"))
         return self.compute_ffn(layer, x, gate_out, enable_quant=False)
 
     def apply_ep_prefill(
         self,
         layer: nn.Layer,
         x: paddle.Tensor,
-        gate_out: paddle.Tensor,
+        gate: nn.Layer,
     ) -> paddle.Tensor:
         """
         Apply the EP prefill method.
@@ -224,7 +211,7 @@ def apply_ep_decode(
         self,
         layer: nn.Layer,
         x: paddle.Tensor,
-        gate_out: paddle.Tensor,
+        gate: nn.Layer,
     ) -> paddle.Tensor:
         """
         Apply the EP decoder method.
@@ -235,7 +222,7 @@ def apply_tp(
         self,
         layer: nn.Layer,
         x: paddle.Tensor,
-        gate_out: paddle.Tensor,
+        gate: nn.Layer,
     ) -> paddle.Tensor:
         """
         Paddle Cutlass compute Fused MoE.
@@ -400,9 +387,10 @@ def apply(
         self,
         layer: nn.Layer,
         x: paddle.Tensor,
-        gate_out: paddle.Tensor,
+        gate: nn.Layer,
     ) -> paddle.Tensor:
         """
         Paddle gcu compute Fused MoE.
         """
+        gate_out = gate(x.cast("float32"))
         return self.compute_ffn(layer, x, gate_out, enable_quant=True)
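
Across these hunks the apply* signatures all change the same way: callers now pass the gate layer itself rather than precomputed routing logits, and each method derives gate_out internally in float32. A minimal sketch of the convention, assuming a plain Linear stand-in for the gate (names and sizes are illustrative):

import paddle
from paddle import nn

hidden_size, num_experts = 8, 4
gate = nn.Linear(hidden_size, num_experts)  # assumed stand-in for the MoE gate
x = paddle.randn([2, hidden_size]).cast("float16")  # activations in half precision

# What each apply* method now does internally before routing:
gate_out = gate(x.cast("float32"))
assert gate_out.shape == [2, num_experts]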

fastdeploy/model_executor/layers/backends/gcu/quantization/weight_only.py

Lines changed: 9 additions & 1 deletion
@@ -37,14 +37,22 @@ def __init__(
         self.quant_config = quant_config
         self.group_size = -1
 
-    def create_weights(self, layer):
+    def create_weights(self, layer, **extra_weight_attrs):
         # The scale shape should be equal to the output dim of weight using Per-Channel Quantization.
         weight_scale_shape = [layer.weight_shape[1]]
 
         layer.weight_shape.reverse()
         if self.quant_config.name() == "wint4":
             layer.weight_shape[0] //= 2
         layer.weight_dtype = "int8"
+
+        layer.weight = layer.create_parameter(
+            shape=layer.weight_shape,
+            dtype=layer.weight_dtype,
+            is_bias=False,
+            default_initializer=paddle.nn.initializer.Constant(0),
+        )
+
         layer.weight_scale = layer.create_parameter(
             shape=weight_scale_shape,
             dtype=layer._dtype,
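
One consequence of creating layer.weight here is that its shape must already reflect the quantized storage layout: the weight is stored as int8 in [out, in] order (hence the reverse()), and wint4 packs two 4-bit values per int8 element, halving the leading dim. A minimal sketch of just that shape bookkeeping, with illustrative sizes:

# Illustrative sizes; quant_name stands in for self.quant_config.name().
weight_shape = [128, 64]                # [in_features, out_features]
weight_scale_shape = [weight_shape[1]]  # per-channel: one scale per output dim

weight_shape.reverse()                  # stored as [out_features, in_features]
quant_name = "wint4"
if quant_name == "wint4":
    weight_shape[0] //= 2               # two int4 values packed per int8
assert weight_shape == [32, 128] and weight_scale_shape == [64]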

fastdeploy/model_executor/layers/backends/xpu/quantization/weight_only.py

Lines changed: 7 additions & 1 deletion
@@ -35,7 +35,7 @@ def __init__(
     ) -> None:
         super().__init__(quant_config)
 
-    def create_weights(self, layer: nn.Layer) -> None:
+    def create_weights(self, layer: nn.Layer, **extra_weight_attrs) -> None:
         """
         Create weights for linear layer on XPU
         """
@@ -45,6 +45,12 @@ def create_weights(self, layer: nn.Layer) -> None:
         if self.quant_config.name() == "weight_only_int4":
             layer.weight_shape[0] //= 2
         layer.weight_dtype = "int8"
+        layer.weight = layer.create_parameter(
+            shape=layer.weight_shape,
+            dtype=layer.weight_dtype,
+            is_bias=False,
+            default_initializer=paddle.nn.initializer.Constant(0),
+        )
         layer.weight_scale = layer.create_parameter(
             shape=weight_scale_shape,
             dtype="float32",
