from .fp8_utils import dequantize_fp8_to_fp32

- try:
-     import TokenDispatcherUtils as TDU
- except:
-     pass
-
if not hasattr(paddle.Tensor, "_clear_to_zero_allocation"):

    def _clear_to_zero_allocation(self):
@@ -151,17 +146,21 @@ def forward(
        tokens_per_expert,
    ):
        if isinstance(hs_2d_dispatched, tuple):
-             (unzipped_tokens, zipped_expertwise_rowmap, unzipped_probs, unzipped_scale,) = TDU.tokens_unzip_stable(
-                 hs_2d_dispatched[0],
-                 hs_2d_dispatched[1],
-                 dispatched_indices,
-                 dispatched_probs,
-                 topk=topk,
-                 num_experts=num_experts,
-                 tokens_per_expert=tokens_per_expert,
-                 padding_multiplex=128,
-                 fill_output=True,
-             )
+             with paddle.amp.auto_cast(False):
+                 (
+                     unzipped_tokens,
+                     zipped_expertwise_rowmap,
+                     unzipped_probs,
+                     unzipped_scale,
+                 ) = paddle.nn.functional.moe_permute(
+                     hs_2d_dispatched[0],
+                     hs_2d_dispatched[1],
+                     dispatched_indices,
+                     dispatched_probs,
+                     num_experts=num_experts,
+                     tokens_per_expert=tokens_per_expert,
+                     padding_alignment=128,
+                 )
        else:
            with paddle.amp.auto_cast(False):
                (
@@ -184,16 +183,17 @@ def forward(

    @paddle.no_grad()
    def backward(self, dx, hidden_states_out_grad, probs_grad, dispatched_indices, num_experts):
-         weighted_zipped_tokens, probs_grad_zipped = TDU.tokens_zip(
-             dx,
-             self.zipped_expertwise_rowmap,
-             dispatched_indices,
-             probs_grad,
-             total_zipped_tokens=hidden_states_out_grad[0].shape[0]
-             if isinstance(hidden_states_out_grad, tuple)
-             else hidden_states_out_grad.shape[0],
-             num_experts=num_experts,
-         )
+         with paddle.amp.auto_cast(False):
+             weighted_zipped_tokens, probs_grad_zipped = paddle.nn.functional.moe_unpermute(
+                 dx,
+                 self.zipped_expertwise_rowmap,
+                 dispatched_indices,
+                 probs_grad,
+                 total_zipped_tokens=hidden_states_out_grad[0].shape[0]
+                 if isinstance(hidden_states_out_grad, tuple)
+                 else hidden_states_out_grad.shape[0],
+                 num_experts=num_experts,
+             )
        self.reset_statue()
        return weighted_zipped_tokens, probs_grad_zipped
@@ -207,9 +207,10 @@ def __init__(self, token_dispatcher, name="zip"):
    def forward(
        self, expert_out, zipped_expertwise_rowmap, routemap_topk, unzipped_probs, total_zipped_tokens, num_experts
    ):
-         expert_out_zipped, zipped_probs_topk = TDU.tokens_zip(
-             expert_out, zipped_expertwise_rowmap, routemap_topk, unzipped_probs, total_zipped_tokens, num_experts
-         )
+         with paddle.amp.auto_cast(False):
+             expert_out_zipped, zipped_probs_topk = paddle.nn.functional.moe_unpermute(
+                 expert_out, zipped_expertwise_rowmap, routemap_topk, unzipped_probs, total_zipped_tokens, num_experts
+             )
        return expert_out_zipped

    @paddle.no_grad()
@@ -223,23 +224,22 @@ def backward(
        tokens_per_expert,
    ):
        if isinstance(grad_output, tuple):
-             (
-                 unzipped_grad,
-                 zipped_expertwise_rowmap_grad,
-                 unzipped_probs_grad,
-                 unzipped_scale_grad,
-             ) = TDU.tokens_unzip_stable(
-                 grad_output[0],
-                 grad_output[1],
-                 dispatched_indices,
-                 dispatched_probs,
-                 top_k,
-                 num_experts,
-                 tokens_per_expert,
-                 padding_multiplex=128,
-                 fill_output=True,
-             )
-             return (unzipped_grad, unzipped_scale_grad)
+             with paddle.amp.auto_cast(False):
+                 (
+                     unzipped_grad,
+                     zipped_expertwise_rowmap_grad,
+                     unzipped_probs_grad,
+                     unzipped_scale_grad,
+                 ) = paddle.nn.functional.moe_permute(
+                     grad_output[0],
+                     grad_output[1],
+                     dispatched_indices,
+                     dispatched_probs,
+                     num_experts,
+                     tokens_per_expert,
+                     padding_alignment=128,
+                 )
+             return (unzipped_grad, unzipped_scale_grad)
        else:
            with paddle.amp.auto_cast(False):
                (
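For orientation only, the sketch below shows how the `moe_permute` / `moe_unpermute` pair introduced in this diff fits together in a dispatcher's unzip → expert → zip flow, with the permutation kernels kept outside AMP via `paddle.amp.auto_cast(False)`. This is a minimal sketch, not code from the PR: it assumes `paddle.nn.functional.moe_permute` and `moe_unpermute` are available with the signatures used above, and `permute_compute_unpermute`, `expert_fn`, `hs_2d`, and `hs_scale` are hypothetical stand-ins for the dispatcher's own tensors and expert computation.

```python
import paddle
import paddle.nn.functional as F


def permute_compute_unpermute(hs_2d, hs_scale, dispatched_indices, dispatched_probs,
                              num_experts, tokens_per_expert, expert_fn):
    """Sketch of the unzip -> expert -> zip flow, mirroring the calls in this diff."""
    # Run the permutation kernels outside AMP so their dtypes are not auto-cast.
    with paddle.amp.auto_cast(False):
        (
            unzipped_tokens,            # tokens regrouped per expert
            zipped_expertwise_rowmap,   # row map needed to zip results back later
            unzipped_probs,             # routing probabilities aligned to the unzipped rows
            unzipped_scale,             # quantization scales aligned to the unzipped rows
        ) = F.moe_permute(
            hs_2d,                      # dispatched 2D activations
            hs_scale,                   # their quantization scales
            dispatched_indices,
            dispatched_probs,
            num_experts=num_experts,
            tokens_per_expert=tokens_per_expert,
            padding_alignment=128,      # pad each expert's rows to a multiple of 128
        )

    # Hypothetical grouped expert computation over the unzipped tokens.
    expert_out = expert_fn(unzipped_tokens, unzipped_scale, unzipped_probs)

    with paddle.amp.auto_cast(False):
        expert_out_zipped, zipped_probs_topk = F.moe_unpermute(
            expert_out,
            zipped_expertwise_rowmap,
            dispatched_indices,
            unzipped_probs,
            total_zipped_tokens=hs_2d.shape[0],
            num_experts=num_experts,
        )
    return expert_out_zipped
```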