Commit 2e630ae

Fix alt pos embeddings and block diagonal mask when flash-attn is disabled
1 parent 6e4a84a commit 2e630ae

2 files changed: +16 −6 lines

exllamav2/attn.py

Lines changed: 15 additions & 5 deletions
@@ -882,8 +882,10 @@ def _attn_torch(self, batch_size, q_len, q_states, k_states, v_states, attn_para
             k_states = k_states[:, :, -self.sliding_window:, :]
             v_states = v_states[:, :, -self.sliding_window:, :]

-        if self.layer_idx in attn_params.block_diag_layers:
+        if self.layer_idx in attn_params.block_diag_layers or causal:
             attn_mask_lr = attn_params.get_block_diag_mask(q_states.device)
+        elif not causal:
+            attn_mask_lr = None
         elif attn_params.is_causal():
             attn_mask_lr = causal_lower_right(q_len, k_states.shape[2])
         else:
@@ -892,7 +894,7 @@ def _attn_torch(self, batch_size, q_len, q_states, k_states, v_states, attn_para
             q_states,
             k_states,
             v_states,
-            attn_mask_lr if causal else None,
+            attn_mask_lr,
             scale = self.scaling
         )

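With the two hunks above, F.scaled_dot_product_attention now receives whatever mask the branch selected (block-diagonal, lower-right causal, or None) instead of dropping it at the call site whenever `causal` is false. A minimal standalone sketch of that pattern, not the library code, assuming a PyTorch version (2.1+) whose SDPA accepts the `scale` keyword:

    import torch
    import torch.nn.functional as F

    def sdpa_with_optional_mask(q, k, v, attn_mask = None, scale = None):
        # attn_mask may be None, a boolean mask, or an additive float mask;
        # F.scaled_dot_product_attention accepts any of these directly.
        return F.scaled_dot_product_attention(q, k, v, attn_mask, scale = scale)

    # (batch, heads, seq, head_dim)
    q = torch.randn(1, 4, 8, 16)
    k = torch.randn(1, 4, 8, 16)
    v = torch.randn(1, 4, 8, 16)

    # Additive block-diagonal mask for two packed sequences of length 4 each
    labels = torch.tensor([0, 0, 0, 0, 1, 1, 1, 1])
    block_mask = torch.where(labels[None, :] == labels[:, None], 0.0, float("-inf"))

    out_block = sdpa_with_optional_mask(q, k, v, block_mask)   # packed-sequence masking
    out_plain = sdpa_with_optional_mask(q, k, v, None)         # no mask at all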
@@ -910,10 +912,12 @@ def _attn_torch(self, batch_size, q_len, q_states, k_states, v_states, attn_para
             attn_mask = attn_params.get_block_diag_mask(attn_weights.device)
         elif causal:
             attn_mask = attn_params.get_attn_mask(attn_weights.device)
+        else:
+            attn_mask = None

         if cfg.attn_logit_softcapping:
             ext_c.softcap_(attn_weights, cfg.attn_logit_softcapping)
-        if causal and attn_mask is not None:
+        if attn_mask is not None:
             attn_weights = attn_weights + attn_mask
         if self.sliding_window and k_states.shape[-1] >= self.sliding_window:
             attn_weights = attn_weights[:, :, :, -self.sliding_window:]
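The same idea applies to the explicit matmul fallback above: the additive mask is added whenever one exists, and the new else branch makes the no-mask case explicit. A small illustrative sketch; `apply_optional_mask` and the example tensors are assumptions, not exllamav2 code:

    import torch

    def apply_optional_mask(attn_weights, attn_mask = None, sliding_window = 0):
        # The mask is applied whenever one was selected; None now simply skips it,
        # where previously the causal flag gated the addition as well.
        if attn_mask is not None:
            attn_weights = attn_weights + attn_mask
        if sliding_window and attn_weights.shape[-1] >= sliding_window:
            attn_weights = attn_weights[:, :, :, -sliding_window:]
        return attn_weights

    scores = torch.randn(1, 4, 8, 8).half()                    # (batch, heads, q_len, k_len)
    causal_bias = torch.full((8, 8), -65504.0).triu(1).half()  # additive mask, lowest normal fp16
    print(apply_optional_mask(scores, causal_bias).shape)      # torch.Size([1, 4, 8, 8])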
@@ -1109,6 +1113,12 @@ def forward(
             offset = attn_params.rope_offsets.cpu().item()
             pass_past_len_1 += offset

+        sc = attn_params.get_alt_rope_embed(self.device_idx)
+        if not sc:
+            sin, cos = constants.sin, constants.cos
+        else:
+            sin, cos = sc
+
         ext_c.q_attn_forward_1(
             self.q_handle,
             hidden_states,
@@ -1119,8 +1129,8 @@ def forward(
             q_states,
             k_states,
             v_states,
-            constants.sin,
-            constants.cos,
+            sin,
+            cos,
             pass_loras,
             pass_lora_temp
         )
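These last two hunks make the non-flash-attn path pick up alternate positional (RoPE) embeddings when attn_params provides them, falling back to the module's precomputed constants.sin / constants.cos otherwise. A self-contained sketch of that fallback; `_Constants`, `_DummyParams`, and `select_rope_tables` are illustrative stand-ins, while `get_alt_rope_embed` is the accessor used in the diff:

    import torch

    class _Constants:
        # stand-ins for the module's precomputed RoPE tables
        sin = torch.zeros(1, 1, 8, 16)
        cos = torch.ones(1, 1, 8, 16)

    class _DummyParams:
        def __init__(self, alt = None):
            self.alt = alt or {}
        def get_alt_rope_embed(self, device_idx):
            # empty when no alternate embeddings exist for this device,
            # otherwise a (sin, cos) pair, mirroring the "if not sc" test in the diff
            return self.alt.get(device_idx, ())

    def select_rope_tables(attn_params, device_idx, constants):
        sc = attn_params.get_alt_rope_embed(device_idx)
        if not sc:
            return constants.sin, constants.cos
        return sc

    sin, cos = select_rope_tables(_DummyParams(), 0, _Constants)          # falls back to constants
    alt = (torch.zeros(1, 1, 8, 16), torch.ones(1, 1, 8, 16))
    sin, cos = select_rope_tables(_DummyParams({0: alt}), 0, _Constants)  # uses the alternate tables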

exllamav2/attn_params.py

Lines changed: 1 addition & 1 deletion
@@ -194,7 +194,7 @@ def get_block_diag_mask(self, device: int) -> torch.Tensor | None:
             return None
         positions = torch.arange(csl[-1], device = csl.device)
         labels = torch.searchsorted(csl[1:], positions, right = True)
-        self.block_diag_mask = labels.unsqueeze(0) == labels.unsqueeze(1).repeat(self.batch_size)
+        self.block_diag_mask = torch.where(labels.unsqueeze(0) == labels.unsqueeze(1).repeat(1, self.batch_size), 0, -65504.0).half()
         if self.block_diag_mask.device.index != device:
             self.block_diag_mask = safe_move_tensor(self.block_diag_mask, device, non_blocking = True)
         return self.block_diag_mask
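This change turns the block-diagonal mask from a boolean equality matrix into an additive float16 mask (0 where attention is allowed, -65504 where it is not), which is the form the torch attention path above adds to the logits. A standalone sketch of the construction for a single packed batch, omitting the repeat over batch_size from the changed line:

    import torch

    def block_diag_mask(cumulative_seqlens: torch.Tensor) -> torch.Tensor:
        # cumulative_seqlens = [0, len0, len0 + len1, ...]
        positions = torch.arange(int(cumulative_seqlens[-1]))
        # label each position with the index of the packed sequence it belongs to
        labels = torch.searchsorted(cumulative_seqlens[1:], positions, right = True)
        same_seq = labels.unsqueeze(0) == labels.unsqueeze(1)
        # 0 where attention is allowed, -65504 (lowest normal fp16) where it is not
        return torch.where(same_seq, 0.0, -65504.0).half()

    csl = torch.tensor([0, 3, 5])   # two packed sequences of lengths 3 and 2
    print(block_diag_mask(csl))
    # (5, 5) tensor: zeros inside the 3x3 and 2x2 diagonal blocks, -65504 elsewhere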
