Skip to content

Commit f024c28

Browse files
committed
Enables Doge SDPA backend support
Aligns the model metadata with the actual attention capability so future backends can reuse the shared attention implementation
1 parent ed981b0 commit f024c28

File tree

1 file changed

+3
-2
lines changed

1 file changed

+3
-2
lines changed

examples/modeling/modeling_doge.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -43,6 +43,7 @@
4343
from transformers.utils.deprecation import deprecate_kwarg
4444
from transformers.utils.generic import OutputRecorder, check_model_inputs
4545
from .configuration_doge import DogeConfig
46+ from transformers.models.doge.modeling_doge import DogeAttention
4647

4748
try:
4849
from flash_sparse_attn.integrations.flash_sparse_attention import flash_sparse_attention_forward
@@ -372,10 +373,10 @@ class DogePreTrainedModel(PreTrainedModel):
372373
_no_split_modules = ["DogeDecoderLayer"]
373374
_skip_keys_device_placement = ["past_key_values"]
374375
_supports_flash_attn = False
375- _supports_sdpa = False
376+ _supports_sdpa = True
376377
_supports_flex_attn = False
377378
_can_compile_fullgraph = False
378- _supports_attention_backend = False
379+ _supports_attention_backend = True
379380
_can_record_outputs = {
380381
"router_logits": OutputRecorder(DogeCDMoE, index=1),
381382
"hidden_states": DogeDecoderLayer,

0 commit comments

Comments (0)