@@ -104,12 +104,15 @@ def reshape_and_cache(
 ):
     # TODO: unify API definition between CPU and XPU in IPEX version > 2.6
     if self.device.type == "xpu" and self._supports_flash_decoding:
+        # Workaround: `slots` is padded, but XPU cannot take a slots tensor whose length differs from the key length; to be fixed in IPEX 2.8
+        valid_len = key.shape[0]
+        truncated_slots = slots[:valid_len]
         PagedAttention.reshape_and_cache_flash(
             key,
             value,
             key_cache,
             value_cache,
-            slots,
+            truncated_slots,
         )
     else:
         PagedAttention.reshape_and_cache(
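For reference, a minimal standalone sketch of what the truncation does (shapes, the `-1` padding value, and all names other than `key` and `slots` are illustrative, not taken from IPEX): the padded `slots` tensor is cut down to exactly one slot per key row before the cache write.

```python
import torch

# Sketch only: `slots` is padded to a fixed length, while `key` holds just
# the valid tokens, so the XPU cache-write kernel must receive exactly one
# slot per key row. Shapes and the -1 padding value are assumptions.
key = torch.randn(3, 8, 64)                # 3 valid tokens, 8 heads, dim 64
slots = torch.tensor([5, 9, 12, -1, -1])   # padded to length 5

valid_len = key.shape[0]                   # number of real tokens
truncated_slots = slots[:valid_len]        # drop the padding entries
assert truncated_slots.shape[0] == key.shape[0]
```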
@@ -127,7 +130,7 @@ def alloc_slot_for_prefill(self, input_lens: torch.Tensor, batch_size: int):
     num_blocks = (input_lens + self.block_size - 1) // self.block_size
     for i in range(batch_size):
         nb = num_blocks[i]
-        scores = self.free_blocks * torch.arange(self.free_blocks.shape[0], 0, -1)
+        scores = self.free_blocks * torch.arange(self.free_blocks.shape[0], 0, -1, device=self.device)
         block_table = torch.topk(scores, nb).indices
         self.block_tables[i][0:nb] = block_table
         self.free_blocks[block_table] = 0
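The `device=self.device` argument matters because `torch.arange` allocates on the CPU by default, so multiplying it with a `free_blocks` tensor that lives on an XPU device raises a device-mismatch error. The scoring expression itself is a small trick worth spelling out: `free_blocks` acts as a 0/1 mask, and weighting it by a descending arange makes lower-indexed free blocks score higher, so `topk` returns the `nb` lowest-indexed free blocks. A self-contained sketch (the function name and mask values are illustrative):

```python
import torch

def pick_free_blocks(free_blocks: torch.Tensor, nb: int) -> torch.Tensor:
    # free_blocks is a 0/1 mask over the block pool. Weighting it with a
    # descending arange scores block i as (n - i) when free and 0 when used,
    # so topk picks the nb lowest-indexed free blocks. The arange must be
    # created on free_blocks.device, or the multiply fails off-CPU.
    n = free_blocks.shape[0]
    scores = free_blocks * torch.arange(n, 0, -1, device=free_blocks.device)
    return torch.topk(scores, nb).indices

free = torch.tensor([0, 1, 1, 0, 1])  # blocks 1, 2 and 4 are free
print(pick_free_blocks(free, 2))      # tensor([1, 2])
```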
@@ -154,7 +157,7 @@ def alloc_slot_for_decode(self, batch_size: int):
         b_idx = start_block_idx[i]
         if self.block_tables[i][b_idx] == -1:
             # Need a free block. Get indices of free blocks, select the first free block
-            scores = self.free_blocks * torch.arange(self.free_blocks.shape[0], 0, -1)
+            scores = self.free_blocks * torch.arange(self.free_blocks.shape[0], 0, -1, device=self.device)
             self.block_tables[i][b_idx] = scores.argmax()
             self.free_blocks[self.block_tables[i][b_idx]] = 0
         self.slots[i] = self.block_tables[i][start_block_idx[i]] * self.block_size + slot_offset_in_block[i]
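The decode-side hunk applies the same two ideas: identical descending-arange scoring, but with `scores.argmax()` instead of `topk`, since a decode step needs at most one new block per sequence (the argmax of the weighted mask is the lowest-indexed free block), and the same `device=self.device` argument to keep the arange on the same device as `self.free_blocks`.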