@@ -831,6 +831,9 @@ def forward_paged_tp_old(
 
     def _attn_torch(self, batch_size, q_len, q_states, k_states, v_states, attn_params, cfg):
 
+        num_attn_heads = q_states.shape[2]
+        head_dim = q_states.shape[3]
+
         q_states = q_states.transpose(1, 2)
         k_states = k_states.transpose(1, 2)
         v_states = v_states.transpose(1, 2)
@@ -881,7 +884,7 @@ def _attn_torch(self, batch_size, q_len, q_states, k_states, v_states, attn_para
         attn_output = torch.matmul(attn_weights, v_states)
 
         attn_output = attn_output.transpose(1, 2)
-        attn_output = attn_output.reshape((batch_size, q_len, cfg.num_attention_heads * cfg.head_dim))
+        attn_output = attn_output.reshape((batch_size, q_len, num_attn_heads * head_dim))
         return attn_output
 
 
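
Note: with this change `_attn_torch` reads the head count and head dimension from the query tensor itself instead of `cfg.num_attention_heads` / `cfg.head_dim`, so the same routine also works on the per-device head slices fed to it by the tensor-parallel fallback further down. A minimal standalone sketch of that shape-driven eager attention, assuming q/k/v share one head count and a lower-right aligned causal mask (the method in this file handles more cases than this sketch does):

```python
import torch

def eager_attention_sketch(q, k, v, scaling):
    # q, k, v: (batch, seq, heads, head_dim); heads and head_dim come from the
    # tensors themselves, so a per-device slice of the heads works unchanged.
    batch_size, q_len, num_heads, head_dim = q.shape
    kv_len = k.shape[1]
    q, k, v = (x.transpose(1, 2) for x in (q, k, v))                # (batch, heads, seq, head_dim)
    attn_weights = torch.matmul(q, k.transpose(-1, -2)) * scaling   # (batch, heads, q_len, kv_len)
    # Lower-right aligned causal mask: the last query row sees the whole key length.
    mask = torch.triu(
        torch.full((q_len, kv_len), float("-inf"), dtype = q.dtype, device = q.device),
        diagonal = kv_len - q_len + 1,
    )
    attn_weights = torch.softmax(attn_weights + mask, dim = -1)
    out = torch.matmul(attn_weights, v).transpose(1, 2)             # (batch, q_len, heads, head_dim)
    return out.reshape(batch_size, q_len, num_heads * head_dim)
```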
@@ -955,8 +958,10 @@ def forward(self,
                 loras: list[ExLlamaV2Lora] | None = None,
                 **kwargs) -> torch.Tensor | dict[str: torch.Tensor]:
 
+        cfg = self.model.config
         global has_flash_attn
         global has_xformers
+        use_flash_attn = has_flash_attn and not cfg.no_flash_attn
 
         if isinstance(attn_params, ExLlamaV2Attention.PagedParams):
             return self.forward_paged(
@@ -968,7 +973,7 @@ def forward(self,
             )
 
         if self.is_tp:
-            if cache is not None:
+            if cache is not None and use_flash_attn:
                 return self.forward_tp(
                     hidden_states,
                     cache,
@@ -1002,7 +1007,6 @@ def forward(self,
                 **kwargs
             )
 
-        cfg = self.model.config
         constants = self.model.get_device_context(self.device_idx)
 
         batch_size, q_len, _ = hidden_states.shape
@@ -1193,7 +1197,10 @@ def forward_tp_old(
 
         assert self.q_handle is not None
         use_flash_attn = has_flash_attn and not cfg.no_flash_attn
-        assert use_flash_attn, "Tensor parallel inference requires flash-attn"
+        if not use_flash_attn:
+            assert has_lower_right_sdpa and attn_params.is_causal() and not cfg.no_sdpa and not cfg.attn_logit_softcapping, \
+                "TP attention without flash-attn must use Torch SDPA with lower-right attention mask " \
+                "(use PyTorch 2.4.0+) and does not support logit softcapping."
 
         hidden_states = self.model.tp_context.broadcast(0, hidden_states, BROADCAST_KV, dim = cfg.head_dim)
 
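
Note: the relaxed assertion spells out what the TP path needs when flash-attn is unavailable: Torch SDPA with a lower-right aligned causal mask (so a short query block lines up with the end of the cached key sequence) and no logit softcapping. Presumably `has_lower_right_sdpa` reflects whether that mask helper is importable; a rough sketch of the mask in use, with illustrative shapes only:

```python
import torch
import torch.nn.functional as F
from torch.nn.attention.bias import causal_lower_right  # recent PyTorch; the assert message asks for 2.4.0+

# Illustrative shapes: 8 heads of dim 64, 4 cached positions plus 2 new query positions.
q = torch.randn(1, 8, 2, 64)      # (batch, heads, q_len, head_dim)
k = torch.randn(1, 8, 6, 64)      # (batch, heads, past_len + q_len, head_dim)
v = torch.randn(1, 8, 6, 64)

# Lower-right alignment lines the causal diagonal up with the *end* of the key
# sequence, which is what attention over a KV cache needs.
bias = causal_lower_right(q.shape[2], k.shape[2])
out = F.scaled_dot_product_attention(q, k, v, attn_mask = bias)
```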
@@ -1236,24 +1243,50 @@ def forward_tp_old(
             torch.cuda.set_stream(context.stream)
 
             if k_cache is not None:
-                attn_output = flash_attn_with_kvcache(
-                    q = q[idx],
-                    k = k[idx],
-                    v = v[idx],
-                    k_cache = k_cache[idx],
-                    v_cache = v_cache[idx],
-                    causal = True,
-                    softmax_scale = self.scaling,
-                    cache_seqlens = attn_params.past_len_tp[idx]
-                )
+                if use_flash_attn:
+                    attn_output = flash_attn_with_kvcache(
+                        q = q[idx],
+                        k = k[idx],
+                        v = v[idx],
+                        k_cache = k_cache[idx],
+                        v_cache = v_cache[idx],
+                        causal = True,
+                        softmax_scale = self.scaling,
+                        cache_seqlens = attn_params.past_len_tp[idx]
+                    )
+                else:
+                    cache_a = attn_params.past_len
+                    cache_b = attn_params.past_len + q_len
+                    k_cache[idx][:batch_size, cache_a:cache_b, :, :].copy_(k[idx])
+                    v_cache[idx][:batch_size, cache_a:cache_b, :, :].copy_(v[idx])
+                    attn_output = self._attn_torch(
+                        batch_size,
+                        q_len,
+                        q[idx],
+                        k_cache[idx][:batch_size, :cache_b, :, :],
+                        v_cache[idx][:batch_size, :cache_b, :, :],
+                        attn_params,
+                        cfg
+                    )
             else:
-                attn_output = flash_attn_func(
-                    q[idx],
-                    k[idx],
-                    v[idx],
-                    causal = True,
-                    softmax_scale = self.scaling,
-                )
+                if use_flash_attn:
+                    attn_output = flash_attn_func(
+                        q[idx],
+                        k[idx],
+                        v[idx],
+                        causal = True,
+                        softmax_scale = self.scaling,
+                    )
+                else:
+                    attn_output = self._attn_torch(
+                        batch_size,
+                        q_len,
+                        q[idx],
+                        k[idx],
+                        v[idx],
+                        attn_params,
+                        cfg
+                    )
 
             attn_output = attn_output.view(batch_size * q_len, (b - a) * cfg.head_dim * cfg.num_key_value_groups)
             attn_outputs.append(attn_output)
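
Note: in the non-flash branch with a cache, this step's keys and values are first copied into the per-device cache window `[cache_a:cache_b]`, and `_attn_torch` then attends over the whole prefix up to `cache_b`. A small sketch of that in-place window update on plain tensors (sizes and names below are made up for illustration; `v_cache` is updated the same way in the diff):

```python
import torch

# Hypothetical sizes: batch of 1, cache with room for 16 positions, 2 KV heads of dim 64.
batch_size, max_seq_len, kv_heads, head_dim = 1, 16, 2, 64
past_len, q_len = 5, 3

k_cache = torch.zeros(batch_size, max_seq_len, kv_heads, head_dim)
k_new = torch.randn(batch_size, q_len, kv_heads, head_dim)

cache_a, cache_b = past_len, past_len + q_len
k_cache[:batch_size, cache_a:cache_b, :, :].copy_(k_new)    # append this step's keys in place
k_states = k_cache[:batch_size, :cache_b, :, :]              # full prefix handed to attention
assert k_states.shape == (batch_size, past_len + q_len, kv_heads, head_dim)
```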