
Commit d455181

[CI] Compatible with paddle.where (#9534)
* fix
1 parent fc25d32 commit d455181
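
The change is essentially the same one-liner applied across nine files: masks that feed paddle.where are cast to bool first (via .to("bool") or .astype("bool")), presumably because the Paddle build used in CI now expects a boolean condition tensor rather than an int/float one. A minimal sketch of the recurring masked_fill helper, assuming an integer 0/1 mask as input:

import paddle

def masked_fill(x, mask, value):
    # paddle.where needs a bool condition; cast the (possibly int/float) mask explicitly.
    y = paddle.full(x.shape, value, x.dtype)
    return paddle.where(mask.to("bool"), y, x)

x = paddle.zeros([2, 2])
mask = paddle.to_tensor([[1, 0], [0, 1]], dtype="int64")
print(masked_fill(x, mask, -1e4))  # masked positions filled with -1e4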

File tree

9 files changed (+14 lines, -13 lines)


llm/experimental/ernie-3.5-se/modeling.py

Lines changed: 1 addition & 1 deletion
@@ -135,7 +135,7 @@ class BFloatFInfo:
 
 def masked_fill(x, mask, value):
     y = paddle.full(x.shape, value, x.dtype)
-    return paddle.where(mask, y, x)
+    return paddle.where(mask.to("bool"), y, x)
 
 
 def scaled_dot_product_attention(

paddlenlp/data/data_collator.py

Lines changed: 2 additions & 1 deletion
@@ -571,7 +571,7 @@ def paddle_mask_tokens(self, inputs: Any, special_tokens_mask: Optional[Any] = N
 
         def masked_fill(x, mask, value):
             y = paddle.full(x.shape, value, x.dtype)
-            return paddle.where(mask, y, x)
+            return paddle.where(mask.to("bool"), y, x)
 
         # probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
         probability_matrix = masked_fill(probability_matrix, special_tokens_mask, value=0.0)
@@ -789,6 +789,7 @@ def paddle_mask_tokens(self, inputs: Any, mask_labels: Any) -> Tuple[Any, Any]:
         ]
 
         def masked_fill(x, mask, value):
+            mask = mask.astype("bool")
             y = paddle.full(x.shape, value, x.dtype)
             return paddle.where(mask, y, x)
 
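
Note that the second hunk here casts inside the helper with mask.astype("bool") rather than at the call site with .to("bool"); for a pure dtype conversion the two are interchangeable. A tiny check, assuming an integer mask:

import paddle

mask = paddle.to_tensor([1, 0, 1])
print(mask.astype("bool").dtype)  # paddle.bool
print(mask.to("bool").dtype)      # paddle.bool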

paddlenlp/transformers/bloom/modeling.py

Lines changed: 1 addition & 1 deletion
@@ -855,7 +855,7 @@ def _prepare_attn_mask(
         # Attention score will be cast to float32 in the following calculation, therefore we set attention_mask dtype as float32
         zero = paddle.zeros(expanded_attn_mask.shape, dtype=paddle.float32)
         neg_inf = paddle.full(expanded_attn_mask.shape, paddle.finfo(paddle.float32).min, dtype=paddle.float32)
-        expanded_attn_mask = paddle.where(expanded_attn_mask, zero, neg_inf)
+        expanded_attn_mask = paddle.where(expanded_attn_mask.to("bool"), zero, neg_inf)
         batch_size, num_heads, sq_len, kv_len = expanded_attn_mask.shape
         return expanded_attn_mask.reshape([batch_size * num_heads, sq_len, kv_len])
 
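
The bloom change keeps the existing zero/neg_inf tensors and only casts the condition. A self-contained sketch of the pattern, assuming a 0/1 integer mask:

import paddle

expanded_attn_mask = paddle.to_tensor([[1, 1, 0]], dtype="int64")
zero = paddle.zeros(expanded_attn_mask.shape, dtype=paddle.float32)
neg_inf = paddle.full(expanded_attn_mask.shape, paddle.finfo(paddle.float32).min, dtype=paddle.float32)
# The explicit bool cast keeps paddle.where happy when the incoming mask is not already boolean.
out = paddle.where(expanded_attn_mask.to("bool"), zero, neg_inf)
print(out)  # [[0., 0., -3.4028235e+38]]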

paddlenlp/transformers/codegen/modeling.py

Lines changed: 1 addition & 1 deletion
@@ -135,7 +135,7 @@ def _attn(self, query, key, value, attention_mask=None):
         attn_weights = attn_weights / self.scale_attn
         mask_value = paddle.to_tensor(-1e4, dtype=attn_weights.dtype)
         # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`.
-        attn_weights = paddle.where(causal_mask, attn_weights, mask_value)
+        attn_weights = paddle.where(causal_mask.to("bool"), attn_weights, mask_value)
 
         if attention_mask is not None:
             # Apply the attention mask

paddlenlp/transformers/gemma/modeling.py

Lines changed: 1 addition & 1 deletion
@@ -1135,7 +1135,7 @@ def _prepare_decoder_attention_mask(attention_mask, input_shape, past_key_values
         else:
             expanded_attn_mask = _make_causal_mask(input_shape, past_key_values_length=past_key_values_length)
         # Convert bool attention_mask to float attention mask, which will be added to attention_scores later
-        expanded_attn_mask = paddle.where(expanded_attn_mask, 0.0, paddle.finfo(dtype).min).astype(dtype)
+        expanded_attn_mask = paddle.where(expanded_attn_mask.to("bool"), 0.0, paddle.finfo(dtype).min).astype(dtype)
         return expanded_attn_mask
 
     @paddle.jit.not_to_static
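
gemma (like mixtral, qwen2, and qwen2_moe below) uses the scalar-argument form of paddle.where, which broadcasts 0.0 and paddle.finfo(dtype).min against the condition. A short sketch under the assumption of a lower-triangular causal mask and a float16 compute dtype:

import paddle

dtype = paddle.float16
# Hypothetical 4x4 causal mask, for illustration only.
causal = paddle.tril(paddle.ones([1, 1, 4, 4], dtype="int64"))
# Allowed positions become 0, disallowed ones the most negative representable value.
additive_mask = paddle.where(causal.to("bool"), 0.0, paddle.finfo(dtype).min).astype(dtype)
print(additive_mask.dtype)  # paddle.float16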

paddlenlp/transformers/gptj/modeling.py

Lines changed: 1 addition & 1 deletion
@@ -152,7 +152,7 @@ def _attn(
         # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`.
         # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device`
         mask_value = paddle.to_tensor(mask_value, dtype=attn_weights.dtype, place=attn_weights.place)
-        attn_weights = paddle.where(causal_mask, attn_weights, mask_value)
+        attn_weights = paddle.where(causal_mask.to("bool"), attn_weights, mask_value)
 
         attn_weights = attn_weights / self.scale_attn
 

paddlenlp/transformers/mixtral/modeling.py

Lines changed: 3 additions & 3 deletions
@@ -299,7 +299,7 @@ def scaled_dot_product_attention(
 
 def masked_fill(x, mask, value):
     y = paddle.full(x.shape, value, x.dtype)
-    return paddle.where(mask, y, x)
+    return paddle.where(mask.to("bool"), y, x)
 
 
 def is_casual_mask(attention_mask):
@@ -519,7 +519,7 @@ def forward(self, hidden_states):
         # this will be used to easily index which expert is going to be sollicitated.
         # shape: [num_experts, top_k, batch_size * seq_len]
         expert_mask = F.one_hot(selected_experts, num_classes=self.num_experts).transpose([2, 1, 0])
-
+        expert_mask = expert_mask.to("bool")
         # Loop over all available experts in the model and perform the computation on each expert.
         for expert_id in range(self.num_experts):
             expert_layer = self.experts[expert_id]
@@ -1098,7 +1098,7 @@ def _prepare_decoder_attention_mask(attention_mask, input_shape, past_key_values
                 past_key_values_length=past_key_values_length,
             )
         # Convert bool attention_mask to float attention mask, which will be added to attention_scores later
-        expanded_attn_mask = paddle.where(expanded_attn_mask, 0.0, paddle.finfo(dtype).min).astype(dtype)
+        expanded_attn_mask = paddle.where(expanded_attn_mask.to("bool"), 0.0, paddle.finfo(dtype).min).astype(dtype)
         return expanded_attn_mask
 
     @paddle.jit.not_to_static
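
Besides the masked_fill and decoder-mask hunks, mixtral also casts the routing mask built from F.one_hot (which returns a float tensor) to bool before the per-expert loop uses it. A rough sketch of just that step, with hypothetical sizes (4 tokens, top_k=2, 8 experts) chosen for illustration:

import paddle
import paddle.nn.functional as F

selected_experts = paddle.randint(0, 8, shape=[4, 2])                      # [batch*seq, top_k]
expert_mask = F.one_hot(selected_experts, num_classes=8).transpose([2, 1, 0])
expert_mask = expert_mask.to("bool")                                       # [num_experts, top_k, batch*seq]
print(expert_mask.shape, expert_mask.dtype)  # [8, 2, 4] paddle.bool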

paddlenlp/transformers/qwen2/modeling.py

Lines changed: 2 additions & 2 deletions
@@ -233,7 +233,7 @@ def scaled_dot_product_attention(
 
 def masked_fill(x, mask, value):
     y = paddle.full(x.shape, value, x.dtype)
-    return paddle.where(mask, y, x)
+    return paddle.where(mask.to("bool"), y, x)
 
 
 def is_casual_mask(attention_mask):
@@ -1020,7 +1020,7 @@ def _prepare_decoder_attention_mask(attention_mask, input_shape, past_key_values
                 past_key_values_length=past_key_values_length,
             )
         # Convert bool attention_mask to float attention mask, which will be added to attention_scores later
-        expanded_attn_mask = paddle.where(expanded_attn_mask, 0.0, paddle.finfo(dtype).min).astype(dtype)
+        expanded_attn_mask = paddle.where(expanded_attn_mask.to("bool"), 0.0, paddle.finfo(dtype).min).astype(dtype)
         return expanded_attn_mask
 
     @paddle.jit.not_to_static

paddlenlp/transformers/qwen2_moe/modeling.py

Lines changed: 2 additions & 2 deletions
@@ -300,7 +300,7 @@ def scaled_dot_product_attention(
 
 def masked_fill(x, mask, value):
     y = paddle.full(x.shape, value, x.dtype)
-    return paddle.where(mask, y, x)
+    return paddle.where(mask.to("bool"), y, x)
 
 
 def is_casual_mask(attention_mask):
@@ -1124,7 +1124,7 @@ def _prepare_decoder_attention_mask(attention_mask, input_shape, past_key_values
                 past_key_values_length=past_key_values_length,
            )
         # Convert bool attention_mask to float attention mask, which will be added to attention_scores later
-        expanded_attn_mask = paddle.where(expanded_attn_mask, 0.0, paddle.finfo(dtype).min).astype(dtype)
+        expanded_attn_mask = paddle.where(expanded_attn_mask.to("bool"), 0.0, paddle.finfo(dtype).min).astype(dtype)
         return expanded_attn_mask
 
     @paddle.jit.not_to_static
