Update on "[ET-VK][ez] Fuse update_cache + custom_sdpa into sdpa_with_kv_cache"

ssjia · ssjia · commit 9bdd6dce8d98 · 2025-11-05T17:19:22.000-08:00
SDPA used to be handled by a custom op, `sdpa_with_kv_cache`, but it was eventually split (D62301837) into the `update_cache` and `custom_sdpa` ops. However, having a single fused op is useful for Vulkan because it allows more control over how the cache tensors are stored and represented; this makes the cache tensors easier to manage and opens up opportunities for future optimizations. This diff introduces a fusion pass that does two things: (1) combines `update_cache` and `custom_sdpa` back into `sdpa_with_kv_cache`; (2) ensures that all references to the `cache_pos` symint use the same node, which prevents the `select_at_dim_as_symint` op from being called every time `cache_pos` is used. Differential Revision: [D86340339](https://our.internmc.facebook.com/intern/diff/D86340339/) [ghstack-poisoned]
diff --git a/backends/vulkan/patterns/sdpa.py b/backends/vulkan/patterns/sdpa.py
@@ -6,6 +6,8 @@
 
 from typing import Any, Optional
 
+import executorch.backends.vulkan.utils as utils
+
 import torch
 
 from executorch.backends.vulkan.patterns.pattern_registry import (
@@ -15,31 +17,18 @@
 )
 
 from executorch.exir import ExportedProgram
-from executorch.exir.dialects._ops import ops as exir_ops
 
 
 def is_update_cache_node(node: Any) -> bool:
-    if not hasattr(node, "target"):
-        return False
+    return utils.node_has_target(node, "llama::update_cache")
 
-    if isinstance(node.target, str):
-        return node.target == "llama::update_cache"
-    elif hasattr(node.target, "name"):
-        return node.target.name() == "llama::update_cache"
-    else:
-        return False
 
+def is_custom_sdpa_node(node: Any) -> bool:
+    return utils.node_has_target(node, "llama::custom_sdpa")
 
-def is_sdpa_with_kv_cache_node(node: Any) -> bool:
-    if not hasattr(node, "target"):
-        return False
 
-    if isinstance(node.target, str):
-        return "sdpa_with_kv_cache" in node.target
-    elif hasattr(node.target, "name"):
-        return "sdpa_with_kv_cache" in node.target.name()
-    else:
-        return False
+def is_sdpa_with_kv_cache_node(node: Any) -> bool:
+    return utils.node_has_target(node, "llama::sdpa_with_kv_cache")
 
 
 class CausalSDPAMatch(PatternMatch):
@@ -97,7 +86,7 @@ def __init__(self, custom_sdpa_node: torch.fx.Node) -> None:
 def find_causal_sdpa_patterns(
     node: torch.fx.Node,
 ) -> Optional[CausalSDPAMatch]:
-    if node.target != exir_ops.edge.llama.custom_sdpa.default:
+    if not is_custom_sdpa_node(node):
         return None
 
     matched_pattern = CausalSDPAMatch(node)
diff --git a/backends/vulkan/utils.py b/backends/vulkan/utils.py
@@ -373,6 +373,18 @@ def find_quant_user(node: torch.fx.Node) -> Optional[torch.fx.Node]:
     return None
 
 
+def node_has_target(node: Any, target: str):
+    if not hasattr(node, "target"):
+        return False
+
+    if isinstance(node.target, str):
+        return node.target == target
+    elif hasattr(node.target, "name"):
+        return node.target.name() == target
+
+    return False
+
+
 ##
 ## Memory Layout, Storage Type Determination
 ##