
Commit 2440512

multi-gpu: fix tensor device placements for various models (#35763)
* multi-gpu: fix inputs_embeds + position_embeds

Fixing the following errors in a few models:

```
>       hidden_states = inputs_embeds + pos_embeds
E       RuntimeError: Expected all tensors to be on the same device, but found at least two devices, xpu:2 and xpu:3!
```

Fixes: #35762

Signed-off-by: Dmitry Rogozhkin <[email protected]>

* multi-gpu: fix tensor device placements for various models

Fixes: #35762

Signed-off-by: Dmitry Rogozhkin <[email protected]>

* Apply make fix-copies

Signed-off-by: Dmitry Rogozhkin <[email protected]>

---------

Signed-off-by: Dmitry Rogozhkin <[email protected]>
1 parent befea8c · commit 2440512
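The error quoted in the commit message comes from models whose layers are spread across several devices (for example via a `device_map`): the embedding output and the tensor it is combined with can land on different devices, and PyTorch refuses to add them. Below is a minimal, hypothetical sketch of that failure mode and of the device-alignment pattern used throughout this commit; the tensor names and shapes are invented for illustration and this is not the actual model code.

```
import torch

# Illustrative only: pretend two shards of a model were placed on different
# devices (the original report used xpu:2 and xpu:3; CUDA/CPU shown here).
device_a = torch.device("cuda:0") if torch.cuda.device_count() > 0 else torch.device("cpu")
device_b = torch.device("cuda:1") if torch.cuda.device_count() > 1 else device_a

inputs_embeds = torch.randn(1, 8, 32, device=device_a)  # output of the embedding layer
pos_embeds = torch.randn(1, 8, 32, device=device_b)      # produced by a layer on another shard

# A naive sum fails when the devices differ:
#   hidden_states = inputs_embeds + pos_embeds
#   RuntimeError: Expected all tensors to be on the same device, ...

# The pattern applied across this commit: move the second operand onto the
# device of the tensor that owns the computation before combining them.
hidden_states = inputs_embeds + pos_embeds.to(inputs_embeds.device)
print(hidden_states.device)
```

The `.to(...)` call is effectively a no-op when both tensors already share a device, so the same code path stays correct for single-GPU runs.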

64 files changed: +177 additions, −66 deletions


src/transformers/models/aria/modeling_aria.py

Lines changed: 3 additions & 1 deletion
@@ -1115,7 +1115,9 @@ def _prepare_4d_causal_attention_mask_with_cache_position(
         if attention_mask is not None:
             causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
             mask_length = attention_mask.shape[-1]
-            padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
+            padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
+                causal_mask.device
+            )
             padding_mask = padding_mask == 0
             causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                 padding_mask, min_dtype
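Every remaining file in this commit applies the same one-line change shown in the hunk above: the user-supplied `attention_mask` may arrive on a different device than the `causal_mask` being assembled, so it is moved with `.to(causal_mask.device)` before the addition. The following is a self-contained sketch of that masking step on CPU, with invented shapes, just to show what the pattern computes; it does not reproduce the multi-device setup itself.

```
import torch

min_dtype = torch.finfo(torch.float32).min

# Imagine causal_mask lives on one shard and attention_mask arrives from another.
causal_mask = torch.full((1, 1, 4, 4), min_dtype).triu(1)  # masked positions hold min_dtype
attention_mask = torch.tensor([[1, 1, 1, 0]])              # last token is padding

mask_length = attention_mask.shape[-1]
# Without .to(causal_mask.device), this addition is what raises
# "Expected all tensors to be on the same device" when the devices differ.
padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
    causal_mask.device
)
padding_mask = padding_mask == 0
causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
    padding_mask, min_dtype
)
print(causal_mask)
```

On a model sharded across devices the two masks really can sit on different devices, which is when the explicit `.to(...)` matters; on a single device it changes nothing.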

src/transformers/models/bloom/modeling_bloom.py

Lines changed: 3 additions & 1 deletion
@@ -845,7 +845,9 @@ def _prepare_4d_causal_attention_mask_with_cache_position(
         if attention_mask is not None:
             causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
             mask_length = attention_mask.shape[-1]
-            padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
+            padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
+                causal_mask.device
+            )
             padding_mask = padding_mask == 0
             causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                 padding_mask, min_dtype

src/transformers/models/chameleon/modeling_chameleon.py

Lines changed: 3 additions & 1 deletion
@@ -1491,7 +1491,9 @@ def _prepare_4d_causal_attention_mask_with_cache_position(
         if attention_mask is not None:
             causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
             mask_length = attention_mask.shape[-1]
-            padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
+            padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
+                causal_mask.device
+            )
             padding_mask = padding_mask == 0
             causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                 padding_mask, min_dtype

src/transformers/models/codegen/modeling_codegen.py

Lines changed: 3 additions & 1 deletion
@@ -688,7 +688,9 @@ def _prepare_4d_causal_attention_mask_with_cache_position(
         if attention_mask is not None:
             causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
             mask_length = attention_mask.shape[-1]
-            padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
+            padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
+                causal_mask.device
+            )
             padding_mask = padding_mask == 0
             causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                 padding_mask, min_dtype

src/transformers/models/cohere/modeling_cohere.py

Lines changed: 3 additions & 1 deletion
@@ -765,7 +765,9 @@ def _prepare_4d_causal_attention_mask_with_cache_position(
         if attention_mask is not None:
             causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
             mask_length = attention_mask.shape[-1]
-            padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
+            padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
+                causal_mask.device
+            )
             padding_mask = padding_mask == 0
             causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                 padding_mask, min_dtype

src/transformers/models/cohere2/modeling_cohere2.py

Lines changed: 3 additions & 1 deletion
@@ -766,7 +766,9 @@ def _prepare_4d_causal_attention_mask_with_cache_position(
         if attention_mask is not None:
             causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
             mask_length = attention_mask.shape[-1]
-            padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
+            padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
+                causal_mask.device
+            )
             padding_mask = padding_mask == 0
             causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                 padding_mask, min_dtype

src/transformers/models/dbrx/modeling_dbrx.py

Lines changed: 3 additions & 1 deletion
@@ -1220,7 +1220,9 @@ def _prepare_4d_causal_attention_mask_with_cache_position(
         if attention_mask is not None:
             causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
             mask_length = attention_mask.shape[-1]
-            padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
+            padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
+                causal_mask.device
+            )
             padding_mask = padding_mask == 0
             causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                 padding_mask, min_dtype

src/transformers/models/diffllama/modeling_diffllama.py

Lines changed: 3 additions & 1 deletion
@@ -1004,7 +1004,9 @@ def _prepare_4d_causal_attention_mask_with_cache_position(
         if attention_mask is not None:
             causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
             mask_length = attention_mask.shape[-1]
-            padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
+            padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
+                causal_mask.device
+            )
             padding_mask = padding_mask == 0
             causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                 padding_mask, min_dtype

src/transformers/models/emu3/modeling_emu3.py

Lines changed: 3 additions & 1 deletion
@@ -1583,7 +1583,9 @@ def _prepare_4d_causal_attention_mask_with_cache_position(
         if attention_mask is not None:
             causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
             mask_length = attention_mask.shape[-1]
-            padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
+            padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
+                causal_mask.device
+            )
             padding_mask = padding_mask == 0
             causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                 padding_mask, min_dtype

src/transformers/models/falcon/modeling_falcon.py

Lines changed: 3 additions & 1 deletion
@@ -1147,7 +1147,9 @@ def _prepare_4d_causal_attention_mask_with_cache_position(
         if attention_mask is not None:
             causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
             mask_length = attention_mask.shape[-1]
-            padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
+            padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
+                causal_mask.device
+            )
             padding_mask = padding_mask == 0
             causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                 padding_mask, min_dtype
