
Commit 530cae4

[CodeStyle] Enable docstring code format and start to use pycon as syntax highlight marker (PaddlePaddle#76542)
1 parent 34ebd2d · commit 530cae4

17 files changed · +448 -363 lines changed

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -10,6 +10,7 @@ target-version = "py39"
 [tool.ruff.format]
 # Prevent change to double quotes by some users use ruff format
 quote-style = "preserve"
+docstring-code-format = true

 [tool.ruff.lint]
 select = [
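
Note on the new setting: with `docstring-code-format = true`, `ruff format` also reformats code that appears inside docstrings, including `>>>`-style examples like the ones touched in the files below. A minimal sketch of the effect, assuming a hypothetical `scale` helper that is not part of this commit:

```python
# Before `ruff format`: the doctest inside the docstring has irregular spacing.
def scale(x, factor=2):
    """Multiply ``x`` by ``factor``.

    .. code-block:: pycon

        >>> scale( 3,factor = 2 )
        6
    """
    return x * factor

# After `ruff format` with docstring-code-format enabled, the example line is
# rewritten with the normal formatting rules, e.g.:
#
#     >>> scale(3, factor=2)
#     6
```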

python/paddle/device/__init__.py

Lines changed: 6 additions & 4 deletions
@@ -697,9 +697,11 @@ def get_default_device() -> paddle.device:
     Returns:
         str: The default device for PaddlePaddle.
     Example:
-        .. code-block:: python
-            import paddle
-            print(paddle.get_default_device())
+        .. code-block:: pycon
+
+            >>> import paddle
+
+            >>> print(paddle.get_default_device())
     """
     return paddle.device(get_device().replace("gpu", "cuda"))

@@ -1093,7 +1095,7 @@ class Event:
    ```python
    # New usage
    paddle.set_device("gpu:0") # Set device first
-   e = paddle.device.Event() # Will use gpu:0
+   e = paddle.device.Event()  # Will use gpu:0
    ```

    paddle.device.Event is equivalent to paddle.cuda.Event.
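
For reference, one practical benefit of the `>>>` prompt style adopted above is that such examples can be checked mechanically. A small self-contained sketch using the standard `doctest` module; the `default_device_name` function is made up for illustration and is unrelated to Paddle's API:

```python
import doctest


def default_device_name():
    """Return a hard-coded device name.

    .. code-block:: pycon

        >>> default_device_name()
        'cpu'
    """
    return 'cpu'


if __name__ == "__main__":
    # doctest scans docstrings for lines starting with ">>>", runs them,
    # and compares the repr/printed output with the expected text below.
    doctest.testmod(verbose=True)
```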

python/paddle/distributed/auto_parallel/intermediate/context_parallel.py

Lines changed: 75 additions & 69 deletions
@@ -40,74 +40,78 @@ class PrepareContextParallel(PlanBase):
         backend (string): select strategy for context parallel, now support 'p2p' and 'all2all'.

     Examples:
-        .. code-block:: python
-
-            >>> import paddle
-            >>> import paddle.distributed as dist
-
-            >>> class SDPALayer(paddle.nn.Layer):
-            ...     def __init__(self):
-            ...         super().__init__()
-            ...
-            ...     def forward(self, q, k, v):
-            ...         return paddle.nn.functional.scaled_dot_product_attention(q, k, v)
-            >>>
-            >>> class AttentionLayer(paddle.nn.Layer):
-            ...     def __init__(self):
-            ...         super().__init__()
-            ...         self.hidden_size = 64
-            ...         self.num_key_value_heads = 10
-            ...         self.head_dim = 64
-            ...         self.sdpa = SDPALayer()
-            ...         self.q = paddle.nn.Linear(
-            ...             self.hidden_size,
-            ...             self.hidden_size,
-            ...             bias_attr=False,
-            ...         )
-            ...         self.k = paddle.nn.Linear(
-            ...             self.hidden_size,
-            ...             self.num_key_value_heads * self.head_dim,
-            ...             bias_attr=False,
-            ...         )
-            ...         self.v = paddle.nn.Linear(
-            ...             self.hidden_size,
-            ...             self.num_key_value_heads * self.head_dim,
-            ...             bias_attr=False,
-            ...         )
-            ...
-            ...     def forward(self, input):
-            ...         q = self.q(input)
-            ...         k = self.k(input)
-            ...         v = self.v(input)
-            ...         return self.sdpa(q, k, v)
-            >>>
-            >>> class LlamaLayer(paddle.nn.Layer):
-            ...     def __init__(self):
-            ...         super().__init__()
-            ...         self.attention = AttentionLayer()
-            ...
-            ...     def forward(self, input, label):
-            ...         return self.attention(input)
-            >>>
-            >>> class LlamaForCausalLayer(paddle.nn.Layer):
-            ...     def __init__(self):
-            ...         super().__init__()
-            ...         self.llama = LlamaLayer()
-            ...         self.weight = self.create_parameter(shape=[64, 1024])
-            ...         self.loss_func = paddle.nn.CrossEntropyLoss()
-            ...
-            ...     def forward(self, input, label):
-            ...         out = self.llama(input, label)
-            ...         logits = paddle.matmul(out, self.weight)
-            ...         loss = self.loss_func(logits, label)
-            ...         return logits
-            >>>
-            >>> # doctest: +REQUIRES(env:DISTRIBUTED)
-            >>> layer = LlamaForCausalLayer()
-            >>> mp_config = {
-            ...     'llama': dist.PrepareContextParallel('p2p'),
-            ...     'sdpa': dist.ContextParallel('p2p'),
-            ... }
+        .. code-block:: pycon
+
+            >>> import paddle
+            >>> import paddle.distributed as dist
+
+            >>> class SDPALayer(paddle.nn.Layer):
+            ...     def __init__(self):
+            ...         super().__init__()
+            ...
+            ...     def forward(self, q, k, v):
+            ...         return (
+            ...             paddle.nn.functional.scaled_dot_product_attention(
+            ...                 q, k, v
+            ...             )
+            ...         )
+            >>>
+            >>> class AttentionLayer(paddle.nn.Layer):
+            ...     def __init__(self):
+            ...         super().__init__()
+            ...         self.hidden_size = 64
+            ...         self.num_key_value_heads = 10
+            ...         self.head_dim = 64
+            ...         self.sdpa = SDPALayer()
+            ...         self.q = paddle.nn.Linear(
+            ...             self.hidden_size,
+            ...             self.hidden_size,
+            ...             bias_attr=False,
+            ...         )
+            ...         self.k = paddle.nn.Linear(
+            ...             self.hidden_size,
+            ...             self.num_key_value_heads * self.head_dim,
+            ...             bias_attr=False,
+            ...         )
+            ...         self.v = paddle.nn.Linear(
+            ...             self.hidden_size,
+            ...             self.num_key_value_heads * self.head_dim,
+            ...             bias_attr=False,
+            ...         )
+            ...
+            ...     def forward(self, input):
+            ...         q = self.q(input)
+            ...         k = self.k(input)
+            ...         v = self.v(input)
+            ...         return self.sdpa(q, k, v)
+            >>>
+            >>> class LlamaLayer(paddle.nn.Layer):
+            ...     def __init__(self):
+            ...         super().__init__()
+            ...         self.attention = AttentionLayer()
+            ...
+            ...     def forward(self, input, label):
+            ...         return self.attention(input)
+            >>>
+            >>> class LlamaForCausalLayer(paddle.nn.Layer):
+            ...     def __init__(self):
+            ...         super().__init__()
+            ...         self.llama = LlamaLayer()
+            ...         self.weight = self.create_parameter(shape=[64, 1024])
+            ...         self.loss_func = paddle.nn.CrossEntropyLoss()
+            ...
+            ...     def forward(self, input, label):
+            ...         out = self.llama(input, label)
+            ...         logits = paddle.matmul(out, self.weight)
+            ...         loss = self.loss_func(logits, label)
+            ...         return logits
+            >>>
+            >>> # doctest: +REQUIRES(env:DISTRIBUTED)
+            >>> layer = LlamaForCausalLayer()
+            >>> mp_config = {
+            ...     'llama': dist.PrepareContextParallel('p2p'),
+            ...     'sdpa': dist.ContextParallel('p2p'),
+            ... }
     """

     def __init__(self, backend: str = 'p2p') -> None:

@@ -245,7 +249,9 @@ class ContextParallel(PlanBase):
             ...         super().__init__()
             ...
             ...     def forward(self, q, k, v):
-            ...         return paddle.nn.functional.scaled_dot_product_attention(q, k, v)
+            ...         return paddle.nn.functional.scaled_dot_product_attention(
+            ...             q, k, v
+            ...         )
             >>>
             >>> class AttentionLayer(paddle.nn.Layer):
             ...     def __init__(self):

python/paddle/distributed/auto_parallel/static/cost/op_runtime_cost.py

Lines changed: 10 additions & 4 deletions
@@ -257,15 +257,21 @@ def measure_program_real_op_cost(
     Example
     -----------
     * Profiling a simple program from scratch:
-        >>> from paddle.distributed.auto_parallel.static.utils import measure_program_real_op_cost
-        >>> program = ... # build your own program object here.
+        >>> from paddle.distributed.auto_parallel.static.utils import (
+        ...     measure_program_real_op_cost,
+        ... )
+        >>> program = ...  # build your own program object here.
         >>> measure_program_real_op_cost(
         >>>     program, verbose_level=1
         >>> )
     * Profiling a program which is already embedded into an Executor or some other class instance:
         >>> import paddle
-        >>> from paddle.distributed.auto_parallel.static.utils import measure_program_real_op_cost
-        >>> place: str = paddle.device.get_device() # here we assume place = "cuda:x"
+        >>> from paddle.distributed.auto_parallel.static.utils import (
+        ...     measure_program_real_op_cost,
+        ... )
+        >>> place: str = (
+        ...     paddle.device.get_device()
+        ... )  # here we assume place = "cuda:x"
         >>> place = paddle.CUDAPlace(int(place.split(':')[1]))
        >>> # here "program" is an inner object that has already been built before
        >>> measure_program_real_op_cost(program, verbose_level=1)
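
A side note on the wrapped statements introduced here: in doctest-style examples, a statement that spans several lines continues on `...` prompt lines, which is why the long import and assignment are broken up this way. A tiny standalone illustration (an assumed snippet, not taken from this file):

```python
import doctest

# A multi-line call inside a doctest: continuation lines use the "..." prompt.
SNIPPET = '''
>>> total = sum(
...     [1, 2, 3]
... )
>>> total
6
'''

# Parse and run the snippet directly; expect "0 failed, 2 attempted".
test = doctest.DocTestParser().get_doctest(SNIPPET, {}, "snippet", None, 0)
results = doctest.DocTestRunner(verbose=False).run(test)
print(f"{results.failed} failed, {results.attempted} attempted")
```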

python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py

Lines changed: 24 additions & 10 deletions
@@ -50,9 +50,15 @@ class HybridParallelInferenceHelper:
            >>> # while op pattern
            >>> with paddle.base.device_guard(f'{device}:all'):
            ...     # init global cond
-           ...     max_len = paddle.full(shape=[1], dtype="int64", fill_value=10)
-           ...     step_idx = paddle.full(shape=[1], dtype="int64", fill_value=0)
-           ...     cond_int = paddle.full(shape=[1], dtype="int64", fill_value=0, name="cond_int")
+           ...     max_len = paddle.full(
+           ...         shape=[1], dtype="int64", fill_value=10
+           ...     )
+           ...     step_idx = paddle.full(
+           ...         shape=[1], dtype="int64", fill_value=0
+           ...     )
+           ...     cond_int = paddle.full(
+           ...         shape=[1], dtype="int64", fill_value=0, name="cond_int"
+           ...     )
            ...     cond = layers.cast(step_idx < max_len, dtype="bool")
            ...     while_op = layers.While(cond, is_test=True)

@@ -62,22 +68,30 @@ class HybridParallelInferenceHelper:
            >>> with while_op.block():
            ...     with paddle.base.device_guard(f'{device}:all'):
            ...         # read data from global lod_tensor_array
-           ...         element_in_arr = paddle.tensor.array_read(array=arr, i=step_idx)
+           ...         element_in_arr = paddle.tensor.array_read(
+           ...             array=arr, i=step_idx
+           ...         )
            ...         # write placeholder data to global lod_tensor_array,
            ...         # it need for send_v2 of lod_tensor_array
            ...         paddle.increment(x=step_idx, value=1.0)
-           ...         paddle.tensor.array_write(element_in_arr, i=step_idx, array=arr)
+           ...         paddle.tensor.array_write(
+           ...             element_in_arr, i=step_idx, array=arr
+           ...         )
            ...     with paddle.base.device_guard(f'{device}:0'):
-           ...         pass # some code
+           ...         pass  # some code
            ...     with paddle.base.device_guard(f'{device}:1'):
-           ...         pass # some code
-           ...     with paddle.base.device_guard(f'{device}:{num_pp-1}'):
+           ...         pass  # some code
+           ...     with paddle.base.device_guard(f'{device}:{num_pp - 1}'):
            ...         # generate some data in while block and write to global lod_tensor_array
            ...         # that they are read in next while step.
            ...         # we will using send_v2 to send global lod_tensor_array to other pipeline and sync
-           ...         paddle.tensor.array_write(other_var, i=step_idx, array=arr)
+           ...         paddle.tensor.array_write(
+           ...             other_var, i=step_idx, array=arr
+           ...         )
            ...         # update cond and assign to cond_int, we will sync cond_int
-           ...         layers.assign(layers.cast(cond, dtype="int32"), cond_int)
+           ...         layers.assign(
+           ...             layers.cast(cond, dtype="int32"), cond_int
+           ...         )
            ...     with paddle.base.device_guard(f'{model._device}:all'):
            ...         # the code below must at end of while block and exists in device:all
            ...         layers.assign(layers.cast(cond_int, dtype='bool'), cond)

python/paddle/incubate/nn/functional/fused_transformer.py

Lines changed: 6 additions & 6 deletions
@@ -552,9 +552,9 @@ def fused_multi_head_attention(
            >>> out = matmul(out, qkv_weight) + qkv_bias
            >>> out = transpose(out, perm=[2, 0, 3, 1, 4])
            >>> # extract q, k and v from out
-           >>> q = out[0:1,::] * (head_dim ** -0.5)
-           >>> k = out[1:2,::]
-           >>> v = out[2:3,::]
+           >>> q = out[0:1, ::] * (head_dim**-0.5)
+           >>> k = out[1:2, ::]
+           >>> v = out[2:3, ::]
            >>> out = matmul(q, k, transpose_y=True)
            >>> out = out + attn_mask
            >>> out = softmax(out)

@@ -653,7 +653,7 @@ def fused_multi_head_attention(
            ...     None, None, None, None, 1e-5, qkv_bias,
            ...     linear_bias, None, attn_mask)
            >>> print(output.shape)
-           [2, 4, 128]
+           paddle.Size([2, 4, 128])
    """

    seed = None

@@ -1103,7 +1103,7 @@ def fused_multi_transformer(
            >>> q = out[0:1, ::]
            >>> k = out[1:2, ::]
            >>> v = out[2:3, ::]
-           >>> out = q * k^t
+           >>> out = q * k ^ t
            >>> out = attn_mask + out
            >>> out = softmax(out)
            >>> out = dropout(out)

@@ -1115,7 +1115,7 @@ def fused_multi_transformer(
            ...     else:
            ...         out = layer_norm(x + dropout(out + bias))

-           >>> residual = out;
+           >>> residual = out
            >>> if pre_layer_norm:
            ...     out = ffn_layer_norm(out)
            >>> out = ffn1_linear(out)
