
Commit 530cae4

[CodeStyle] Enable docstring code format and start to use pycon as syntax highlight marker (PaddlePaddle#76542)
1 parent 34ebd2d · commit 530cae4

17 files changed · +448 -363 lines changed

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -10,6 +10,7 @@ target-version = "py39"
 [tool.ruff.format]
 # Prevent change to double quotes by some users use ruff format
 quote-style = "preserve"
+docstring-code-format = true

 [tool.ruff.lint]
 select = [
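
Note on the new setting: with `docstring-code-format = true`, `ruff format` also reformats code that appears inside docstrings, including `>>>`-style examples like the ones touched in the files below. A minimal sketch of the effect, assuming a hypothetical `scale` helper that is not part of this commit:

```python
# Before `ruff format`: the doctest inside the docstring has irregular spacing.
def scale(x, factor=2):
    """Multiply ``x`` by ``factor``.

    .. code-block:: pycon

        >>> scale( 3,factor = 2 )
        6
    """
    return x * factor

# After `ruff format` with docstring-code-format enabled, the example line is
# rewritten with the normal formatting rules, e.g.:
#
#     >>> scale(3, factor=2)
#     6
```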

python/paddle/device/__init__.py

Lines changed: 6 additions & 4 deletions
@@ -697,9 +697,11 @@ def get_default_device() -> paddle.device:
     Returns:
         str: The default device for PaddlePaddle.
     Example:
-        .. code-block:: python
-            import paddle
-            print(paddle.get_default_device())
+        .. code-block:: pycon
+
+            >>> import paddle
+
+            >>> print(paddle.get_default_device())
     """
     return paddle.device(get_device().replace("gpu", "cuda"))

@@ -1093,7 +1095,7 @@ class Event:
    ```python
    # New usage
    paddle.set_device("gpu:0") # Set device first
-   e = paddle.device.Event() # Will use gpu:0
+   e = paddle.device.Event()  # Will use gpu:0
    ```

    paddle.device.Event is equivalent to paddle.cuda.Event.
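
For reference, one practical benefit of the `>>>` prompt style adopted above is that such examples can be checked mechanically. A small self-contained sketch using the standard `doctest` module; the `default_device_name` function is made up for illustration and is unrelated to Paddle's API:

```python
import doctest


def default_device_name():
    """Return a hard-coded device name.

    .. code-block:: pycon

        >>> default_device_name()
        'cpu'
    """
    return 'cpu'


if __name__ == "__main__":
    # doctest scans docstrings for lines starting with ">>>", runs them,
    # and compares the repr/printed output with the expected text below.
    doctest.testmod(verbose=True)
```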

python/paddle/distributed/auto_parallel/intermediate/context_parallel.py

Lines changed: 75 additions & 69 deletions
@@ -40,74 +40,78 @@ class PrepareContextParallel(PlanBase):
         backend (string): select strategy for context parallel, now support 'p2p' and 'all2all'.

     Examples:
-        .. code-block:: python
-
-            >>> import paddle
-            >>> import paddle.distributed as dist
-
-            >>> class SDPALayer(paddle.nn.Layer):
-            ...     def __init__(self):
-            ...         super().__init__()
-            ...
-            ...     def forward(self, q, k, v):
-            ...         return paddle.nn.functional.scaled_dot_product_attention(q, k, v)
-            >>>
-            >>> class AttentionLayer(paddle.nn.Layer):
-            ...     def __init__(self):
-            ...         super().__init__()
-            ...         self.hidden_size = 64
-            ...         self.num_key_value_heads = 10
-            ...         self.head_dim = 64
-            ...         self.sdpa = SDPALayer()
-            ...         self.q = paddle.nn.Linear(
-            ...             self.hidden_size,
-            ...             self.hidden_size,
-            ...             bias_attr=False,
-            ...         )
-            ...         self.k = paddle.nn.Linear(
-            ...             self.hidden_size,
-            ...             self.num_key_value_heads * self.head_dim,
-            ...             bias_attr=False,
-            ...         )
-            ...         self.v = paddle.nn.Linear(
-            ...             self.hidden_size,
-            ...             self.num_key_value_heads * self.head_dim,
-            ...             bias_attr=False,
-            ...         )
-            ...
-            ...     def forward(self, input):
-            ...         q = self.q(input)
-            ...         k = self.k(input)
-            ...         v = self.v(input)
-            ...         return self.sdpa(q, k, v)
-            >>>
-            >>> class LlamaLayer(paddle.nn.Layer):
-            ...     def __init__(self):
-            ...         super().__init__()
-            ...         self.attention = AttentionLayer()
-            ...
-            ...     def forward(self, input, label):
-            ...         return self.attention(input)
-            >>>
-            >>> class LlamaForCausalLayer(paddle.nn.Layer):
-            ...     def __init__(self):
-            ...         super().__init__()
-            ...         self.llama = LlamaLayer()
-            ...         self.weight = self.create_parameter(shape=[64, 1024])
-            ...         self.loss_func = paddle.nn.CrossEntropyLoss()
-            ...
-            ...     def forward(self, input, label):
-            ...         out = self.llama(input, label)
-            ...         logits = paddle.matmul(out, self.weight)
-            ...         loss = self.loss_func(logits, label)
-            ...         return logits
-            >>>
-            >>> # doctest: +REQUIRES(env:DISTRIBUTED)
-            >>> layer = LlamaForCausalLayer()
-            >>> mp_config = {
-            ...     'llama': dist.PrepareContextParallel('p2p'),
-            ...     'sdpa': dist.ContextParallel('p2p'),
-            ... }
+        .. code-block:: pycon
+
+            >>> import paddle
+            >>> import paddle.distributed as dist
+
+            >>> class SDPALayer(paddle.nn.Layer):
+            ...     def __init__(self):
+            ...         super().__init__()
+            ...
+            ...     def forward(self, q, k, v):
+            ...         return (
+            ...             paddle.nn.functional.scaled_dot_product_attention(
+            ...                 q, k, v
+            ...             )
+            ...         )
+            >>>
+            >>> class AttentionLayer(paddle.nn.Layer):
+            ...     def __init__(self):
+            ...         super().__init__()
+            ...         self.hidden_size = 64
+            ...         self.num_key_value_heads = 10
+            ...         self.head_dim = 64
+            ...         self.sdpa = SDPALayer()
+            ...         self.q = paddle.nn.Linear(
+            ...             self.hidden_size,
+            ...             self.hidden_size,
+            ...             bias_attr=False,
+            ...         )
+            ...         self.k = paddle.nn.Linear(
+            ...             self.hidden_size,
+            ...             self.num_key_value_heads * self.head_dim,
+            ...             bias_attr=False,
+            ...         )
+            ...         self.v = paddle.nn.Linear(
+            ...             self.hidden_size,
+            ...             self.num_key_value_heads * self.head_dim,
+            ...             bias_attr=False,
+            ...         )
+            ...
+            ...     def forward(self, input):
+            ...         q = self.q(input)
+            ...         k = self.k(input)
+            ...         v = self.v(input)
+            ...         return self.sdpa(q, k, v)
+            >>>
+            >>> class LlamaLayer(paddle.nn.Layer):
+            ...     def __init__(self):
+            ...         super().__init__()
+            ...         self.attention = AttentionLayer()
+            ...
+            ...     def forward(self, input, label):
+            ...         return self.attention(input)
+            >>>
+            >>> class LlamaForCausalLayer(paddle.nn.Layer):
+            ...     def __init__(self):
+            ...         super().__init__()
+            ...         self.llama = LlamaLayer()
+            ...         self.weight = self.create_parameter(shape=[64, 1024])
+            ...         self.loss_func = paddle.nn.CrossEntropyLoss()
+            ...
+            ...     def forward(self, input, label):
+            ...         out = self.llama(input, label)
+            ...         logits = paddle.matmul(out, self.weight)
+            ...         loss = self.loss_func(logits, label)
+            ...         return logits
+            >>>
+            >>> # doctest: +REQUIRES(env:DISTRIBUTED)
+            >>> layer = LlamaForCausalLayer()
+            >>> mp_config = {
+            ...     'llama': dist.PrepareContextParallel('p2p'),
+            ...     'sdpa': dist.ContextParallel('p2p'),
+            ... }
     """

     def __init__(self, backend: str = 'p2p') -> None:

@@ -245,7 +249,9 @@ class ContextParallel(PlanBase):
             ...         super().__init__()
             ...
             ...     def forward(self, q, k, v):
-            ...         return paddle.nn.functional.scaled_dot_product_attention(q, k, v)
+            ...         return paddle.nn.functional.scaled_dot_product_attention(
+            ...             q, k, v
+            ...         )
             >>>
             >>> class AttentionLayer(paddle.nn.Layer):
             ...     def __init__(self):

python/paddle/distributed/auto_parallel/static/cost/op_runtime_cost.py

Lines changed: 10 additions & 4 deletions
@@ -257,15 +257,21 @@ def measure_program_real_op_cost(
     Example
     -----------
     * Profiling a simple program from scratch:
-        >>> from paddle.distributed.auto_parallel.static.utils import measure_program_real_op_cost
-        >>> program = ... # build your own program object here.
+        >>> from paddle.distributed.auto_parallel.static.utils import (
+        ...     measure_program_real_op_cost,
+        ... )
+        >>> program = ...  # build your own program object here.
         >>> measure_program_real_op_cost(
         >>>     program, verbose_level=1
         >>> )
     * Profiling a program which is already embedded into an Executor or some other class instance:
         >>> import paddle
-        >>> from paddle.distributed.auto_parallel.static.utils import measure_program_real_op_cost
-        >>> place: str = paddle.device.get_device() # here we assume place = "cuda:x"
+        >>> from paddle.distributed.auto_parallel.static.utils import (
+        ...     measure_program_real_op_cost,
+        ... )
+        >>> place: str = (
+        ...     paddle.device.get_device()
+        ... )  # here we assume place = "cuda:x"
         >>> place = paddle.CUDAPlace(int(place.split(':')[1]))
        >>> # here "program" is an inner object that has already been built before
        >>> measure_program_real_op_cost(program, verbose_level=1)
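
A side note on the wrapped statements introduced here: in doctest-style examples, a statement that spans several lines continues on `...` prompt lines, which is why the long import and assignment are broken up this way. A tiny standalone illustration (an assumed snippet, not taken from this file):

```python
import doctest

# A multi-line call inside a doctest: continuation lines use the "..." prompt.
SNIPPET = '''
>>> total = sum(
...     [1, 2, 3]
... )
>>> total
6
'''

# Parse and run the snippet directly; expect "0 failed, 2 attempted".
test = doctest.DocTestParser().get_doctest(SNIPPET, {}, "snippet", None, 0)
results = doctest.DocTestRunner(verbose=False).run(test)
print(f"{results.failed} failed, {results.attempted} attempted")
```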

python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py

Lines changed: 24 additions & 10 deletions
@@ -50,9 +50,15 @@ class HybridParallelInferenceHelper:
            >>> # while op pattern
            >>> with paddle.base.device_guard(f'{device}:all'):
            ...     # init global cond
-           ...     max_len = paddle.full(shape=[1], dtype="int64", fill_value=10)
-           ...     step_idx = paddle.full(shape=[1], dtype="int64", fill_value=0)
-           ...     cond_int = paddle.full(shape=[1], dtype="int64", fill_value=0, name="cond_int")
+           ...     max_len = paddle.full(
+           ...         shape=[1], dtype="int64", fill_value=10
+           ...     )
+           ...     step_idx = paddle.full(
+           ...         shape=[1], dtype="int64", fill_value=0
+           ...     )
+           ...     cond_int = paddle.full(
+           ...         shape=[1], dtype="int64", fill_value=0, name="cond_int"
+           ...     )
            ...     cond = layers.cast(step_idx < max_len, dtype="bool")
            ...     while_op = layers.While(cond, is_test=True)

@@ -62,22 +68,30 @@ class HybridParallelInferenceHelper:
            >>> with while_op.block():
            ...     with paddle.base.device_guard(f'{device}:all'):
            ...         # read data from global lod_tensor_array
-           ...         element_in_arr = paddle.tensor.array_read(array=arr, i=step_idx)
+           ...         element_in_arr = paddle.tensor.array_read(
+           ...             array=arr, i=step_idx
+           ...         )
            ...         # write placeholder data to global lod_tensor_array,
            ...         # it need for send_v2 of lod_tensor_array
            ...         paddle.increment(x=step_idx, value=1.0)
-           ...         paddle.tensor.array_write(element_in_arr, i=step_idx, array=arr)
+           ...         paddle.tensor.array_write(
+           ...             element_in_arr, i=step_idx, array=arr
+           ...         )
            ...     with paddle.base.device_guard(f'{device}:0'):
-           ...         pass # some code
+           ...         pass  # some code
            ...     with paddle.base.device_guard(f'{device}:1'):
-           ...         pass # some code
-           ...     with paddle.base.device_guard(f'{device}:{num_pp-1}'):
+           ...         pass  # some code
+           ...     with paddle.base.device_guard(f'{device}:{num_pp - 1}'):
            ...         # generate some data in while block and write to global lod_tensor_array
            ...         # that they are read in next while step.
            ...         # we will using send_v2 to send global lod_tensor_array to other pipeline and sync
-           ...         paddle.tensor.array_write(other_var, i=step_idx, array=arr)
+           ...         paddle.tensor.array_write(
+           ...             other_var, i=step_idx, array=arr
+           ...         )
            ...         # update cond and assign to cond_int, we will sync cond_int
-           ...         layers.assign(layers.cast(cond, dtype="int32"), cond_int)
+           ...         layers.assign(
+           ...             layers.cast(cond, dtype="int32"), cond_int
+           ...         )
            ...     with paddle.base.device_guard(f'{model._device}:all'):
            ...         # the code below must at end of while block and exists in device:all
            ...         layers.assign(layers.cast(cond_int, dtype='bool'), cond)

python/paddle/incubate/nn/functional/fused_transformer.py

Lines changed: 6 additions & 6 deletions
@@ -552,9 +552,9 @@ def fused_multi_head_attention(
            >>> out = matmul(out, qkv_weight) + qkv_bias
            >>> out = transpose(out, perm=[2, 0, 3, 1, 4])
            >>> # extract q, k and v from out
-           >>> q = out[0:1,::] * (head_dim ** -0.5)
-           >>> k = out[1:2,::]
-           >>> v = out[2:3,::]
+           >>> q = out[0:1, ::] * (head_dim**-0.5)
+           >>> k = out[1:2, ::]
+           >>> v = out[2:3, ::]
            >>> out = matmul(q, k, transpose_y=True)
            >>> out = out + attn_mask
            >>> out = softmax(out)

@@ -653,7 +653,7 @@ def fused_multi_head_attention(
            ...     None, None, None, None, 1e-5, qkv_bias,
            ...     linear_bias, None, attn_mask)
            >>> print(output.shape)
-           [2, 4, 128]
+           paddle.Size([2, 4, 128])
    """

    seed = None

@@ -1103,7 +1103,7 @@ def fused_multi_transformer(
            >>> q = out[0:1, ::]
            >>> k = out[1:2, ::]
            >>> v = out[2:3, ::]
-           >>> out = q * k^t
+           >>> out = q * k ^ t
            >>> out = attn_mask + out
            >>> out = softmax(out)
            >>> out = dropout(out)

@@ -1115,7 +1115,7 @@ def fused_multi_transformer(
            ...     else:
            ...         out = layer_norm(x + dropout(out + bias))

-           >>> residual = out;
+           >>> residual = out
            >>> if pre_layer_norm:
            ...     out = ffn_layer_norm(out)
            >>> out = ffn1_linear(out)
