
Commit ff60bf3

Merge remote-tracking branch 'nm/sage/amd-deepseek' into upstream_merge_25_03_10
2 parents: 9ef3d37 + 8f9664d

2 files changed: +3 −17 lines


vllm/attention/backends/mla/common.py

Lines changed: 1 addition & 15 deletions
@@ -1327,21 +1327,7 @@ def _compute_prefill_context(
                     [0, q.shape[-1] - v.shape[-1]],
                     value=0)
 
-            if is_hip and envs.VLLM_USE_TRITON_FLASH_ATTN:
-                attn_output, attn_softmax_lse = self.triton_fa_func(
-                    q,
-                    k,
-                    v_padded,
-                    None,
-                    prefill_metadata.query_start_loc,
-                    prefill_metadata.context_chunk_cu_seq_lens[i],
-                    prefill_metadata.max_query_len,
-                    prefill_metadata.context_chunk_max_seq_lens[i],
-                    False,  # causal
-                    self.scale,
-                    None,  # attn_mask is None unless applying ALiBi mask
-                )
-            elif is_vllm_fa:
+            if is_vllm_fa:
                 attn_output, attn_softmax_lse = self.flash_attn_varlen_func(
                     q=q,
                     k=k,
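
For readers skimming the hunk above: after this commit the ROCm-specific Triton flash-attention branch is gone, and chunked-context attention dispatches directly on is_vllm_fa. Below is a minimal, self-contained sketch of that dispatch shape; the helper function and its signature are hypothetical, and only is_vllm_fa and the keyword-style flash_attn_varlen call come from the diff.

    from typing import Callable, Tuple

    import torch


    def run_chunked_context_attention(
        q: torch.Tensor,
        k: torch.Tensor,
        v: torch.Tensor,
        is_vllm_fa: bool,
        flash_attn_varlen_func: Callable[..., Tuple[torch.Tensor,
                                                    torch.Tensor]],
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        # Hypothetical sketch: post-commit there is no
        # `is_hip and envs.VLLM_USE_TRITON_FLASH_ATTN` special case, so the
        # vLLM flash-attn build is the first backend considered.
        if is_vllm_fa:
            # The real call also threads varlen metadata (query_start_loc,
            # cu_seq_lens, max seq lens); elided here for brevity.
            return flash_attn_varlen_func(q=q, k=k, v=v)
        raise NotImplementedError("non-flash-attn fallbacks elided in sketch")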

vllm/config.py

Lines changed: 2 additions & 2 deletions
@@ -3474,9 +3474,9 @@ def __post_init__(self):
             self.compilation_config.level = CompilationLevel.NO_COMPILATION
 
         if self.model_config and self.model_config.use_mla and \
-            not current_platform.is_cuda():
+            not (current_platform.is_cuda() or current_platform.is_rocm()):
             logger.info(
-                "MLA is enabled on a non-cuda platform; forcing chunked "
+                "MLA is enabled on a non-GPU platform; forcing chunked "
                 "prefill and prefix caching to be disabled.")
             self.scheduler_config.enable_chunked_prefill = False
             self.scheduler_config.chunked_prefill_enabled = False
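
The config.py hunk widens the platform check so ROCm, like CUDA, counts as a GPU for MLA, and chunked prefill stays enabled there. A hedged, runnable rendering of the resulting guard is below; the stub classes and the free function are hypothetical, and only the attribute names and the CUDA-or-ROCm condition come from the diff.

    from dataclasses import dataclass


    # Hypothetical stand-ins for the vLLM config objects this hunk touches;
    # only the attribute names are taken from the diff.
    @dataclass
    class ModelStub:
        use_mla: bool = True


    @dataclass
    class SchedulerStub:
        enable_chunked_prefill: bool = True
        chunked_prefill_enabled: bool = True


    def maybe_disable_chunked_prefill(model: ModelStub, sched: SchedulerStub,
                                      is_cuda: bool, is_rocm: bool) -> None:
        # Mirrors the post-commit guard: CUDA *or* ROCm keeps chunked prefill;
        # only genuinely non-GPU platforms are forced off.
        if model and model.use_mla and not (is_cuda or is_rocm):
            print("MLA is enabled on a non-GPU platform; forcing chunked "
                  "prefill and prefix caching to be disabled.")
            sched.enable_chunked_prefill = False
            sched.chunked_prefill_enabled = False


    # ROCm now takes the GPU path, so nothing is disabled here.
    maybe_disable_chunked_prefill(ModelStub(), SchedulerStub(),
                                  is_cuda=False, is_rocm=True)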
