Cache NPUKernel objects (#2611)

mawad-amd · web-flow · commit 69ffd7a153de · 2025-09-25T23:33:36.000Z
Signed-off-by: Muhammad Awad <MuhammadAbdelghaffar.Awad@amd.com>
diff --git a/.github/workflows/buildAndTestRyzenAI.yml b/.github/workflows/buildAndTestRyzenAI.yml
@@ -60,6 +60,8 @@ jobs:
       fail-fast: false
       matrix:
         runner_type: [ amd7940hs, amdhx370 ]
+    env:
+      IRON_CACHE_HOME: ${{ github.workspace }}/iron-cache-${{ matrix.runner_type }}-${{ github.run_id }}
     steps:
       - uses: actions/checkout@v4
         with:
@@ -126,6 +128,9 @@ jobs:
             -DMLIR_DIR=$PWD/../mlir/lib/cmake/mlir \
             $CMAKE_ARGS
 
+          # Create runner-specific cache directory
+          mkdir -p $IRON_CACHE_HOME
+
           ninja install
           ninja check-aie
           popd
@@ -137,6 +142,8 @@ jobs:
       fail-fast: false
       matrix:
         runner_type: [ amd7940hs, amdhx370 ]
+    env:
+      IRON_CACHE_HOME: ${{ github.workspace }}/iron-cache-${{ matrix.runner_type }}-${{ github.run_id }}
     steps:
       - uses: actions/checkout@v4
         with:
@@ -183,8 +190,10 @@ jobs:
             LIT_OPTS="-j12 $LIT_OPTS"
           fi
 
+          # Create runner-specific cache directory
+          mkdir -p $IRON_CACHE_HOME
+
           ninja install
           ninja check-reference-designs
           ninja check-programming-guide
-
-          popd
+          popd
diff --git a/python/iron/jit.py b/python/iron/jit.py
@@ -23,10 +23,44 @@
 from aie.dialects.aie import AIEDevice
 
 
-# The `iron.jit` decorator below caches compiled kenrels inside the `IRON_CACHE_DIR` directory.
+# The `iron.jit` decorator below caches compiled kernels inside the `IRON_CACHE_HOME` directory.
 # Kernels are cached based on their hash value of the MLIR module string. If during compilation,
 # we hit in the cache, the `iron.jit` will load the xclbin and instruction binary files from the cache.
-IRON_CACHE_DIR = os.path.expanduser("~/.iron/cache")
+IRON_CACHE_HOME = os.environ.get("IRON_CACHE_HOME", os.path.expanduser("~/.iron/cache"))
+
+
+class CircularCache:
+    def __init__(self, max_size):
+        self.max_size = max_size
+        self.cache = [None] * max_size
+        self.keys = [None] * max_size
+        self.index = 0
+
+    def __contains__(self, key):
+        return key in self.keys
+
+    def __getitem__(self, key):
+        idx = self.keys.index(key)
+        return self.cache[idx]
+
+    def __setitem__(self, key, value):
+        self.cache[self.index] = value
+        self.keys[self.index] = key
+        self.index = (self.index + 1) % self.max_size
+
+    def __len__(self):
+        return sum(1 for k in self.keys if k is not None)
+
+    def clear(self):
+        self.cache = [None] * self.max_size
+        self.keys = [None] * self.max_size
+        self.index = 0
+
+
+# Global cache for compiled kernels at the function level
+# Key: (function_name, args_signature) -> NPUKernel instance
+# The cache is bounded: once `max_size` kernels are stored, the oldest entry is overwritten
+_compiled_kernels = CircularCache(max_size=1)
 
 
 class NPUKernel:
@@ -117,8 +151,21 @@ def __del__(self):
         """
         Destructor to clean up resources and delete the kernel and device objects.
         """
-        del self.__kernel
-        del self.__device
+        if hasattr(self, "_NPUKernel__insts_buffer_bo"):
+            del self.__insts_buffer_bo
+            self.__insts_buffer_bo = None
+        if hasattr(self, "_NPUKernel__kernel"):
+            del self.__kernel
+            self.__kernel = None
+        if hasattr(self, "_NPUKernel__context"):
+            del self.__context
+            self.__context = None
+        if hasattr(self, "_NPUKernel__xclbin"):
+            del self.__xclbin
+            self.__xclbin = None
+        if hasattr(self, "_NPUKernel__device"):
+            del self.__device
+            self.__device = None
 
 
 class NPUKernel_Error(Exception):
@@ -145,6 +192,12 @@ def jit(function=None, is_placed=True, use_cache=True):
     def decorator(*args, **kwargs):
         from .kernel import ExternalFunction
 
+        # Check if we already have a compiled kernel for this function signature
+        cache_key = _create_function_cache_key(function, args, kwargs)
+        if cache_key in _compiled_kernels:
+            cached_kernel = _compiled_kernels[cache_key]
+            return cached_kernel(*args, **kwargs)
+
         # Clear any instances from previous runs to make sure if the user provided any broken code we don't try to recompile it
         ExternalFunction._instances.clear()
 
@@ -198,7 +251,7 @@ def decorator(*args, **kwargs):
 
         # Hash of the IR string, ExternalFunction compiler options, and target architecture
         module_hash = hash_module(mlir_module, external_kernels, target_arch)
-        kernel_dir = os.path.join(IRON_CACHE_DIR, f"{module_hash}")
+        kernel_dir = os.path.join(IRON_CACHE_HOME, f"{module_hash}")
         mlir_path = os.path.join(kernel_dir, "aie.mlir")
 
         # Ensure cache directory exists
@@ -238,6 +291,10 @@ def decorator(*args, **kwargs):
         kernel_name = "MLIR_AIE"
         try:
             kernel = NPUKernel(xclbin_path, inst_path, kernel_name=kernel_name)
+
+            # Cache the kernel for this function signature
+            _compiled_kernels[cache_key] = kernel
+
             result = kernel(*args, **kwargs)
             return result
         except Exception as e:
@@ -313,15 +370,14 @@ def hash_module(module, external_kernels=None, target_arch=None):
     """
     mlir_str = str(module)
 
-    # Include ExternalFunction compiler options in the hash
+    # Include ExternalFunction compiler options and source code in the hash
     if external_kernels:
-        compiler_options = []
+        running_hash = ""
+        source_contents = []
         for func in external_kernels:
-            compiler_options.extend(func._include_dirs)
-            compiler_options.extend(func._compile_flags)
+            running_hash += str(hash(func))
 
-        # Create a combined string for hashing
-        combined_str = mlir_str + "|" + "|".join(compiler_options)
+        combined_str = mlir_str + "|" + "|".join(running_hash)
     else:
         combined_str = mlir_str
 
@@ -331,3 +387,52 @@ def hash_module(module, external_kernels=None, target_arch=None):
 
     hash_result = hashlib.sha256(combined_str.encode("utf-8")).hexdigest()[:16]
     return hash_result
+
+
+def _hash_argument(arg, prefix=""):
+    """
+    Helper function to hash supported argument types (tensors and callables).
+    Returns a string representation for cache key generation.
+    """
+    from aie.iron.tensor import Tensor
+    from aie.iron.kernel import ExternalFunction
+
+    if isinstance(arg, Tensor):
+        # Tensor argument - include shape and dtype
+        return f"{prefix}tensor_{arg.shape}_{arg.dtype}"
+    elif isinstance(arg, ExternalFunction):
+        # ExternalFunction argument - use its custom hash method
+        func_hash = hash(arg)
+        return f"{prefix}externalfunction_{func_hash}"
+    elif callable(arg):
+        # Function argument - use hash of function address for uniqueness
+        func_hash = hash(arg)
+        return f"{prefix}function_{func_hash}"
+    else:
+        # Unsupported type - use type name
+        return f"{prefix}{type(arg).__name__}"
+
+
+def _create_function_cache_key(function, args, kwargs):
+    """
+    Create a cache key for a function call based on function name and argument types/shapes.
+    This allows us to cache compiled kernels at the function level.
+    Note that it is not strictly necessary to include the tensor shapes in the key, since the
+    kernel may be agnostic to shape changes, but we do so here for safety.
+    """
+    # Get function name
+    func_name = function.__name__
+
+    # Create signature from argument types and shapes
+    signature_parts = []
+
+    for arg in args:
+        result = _hash_argument(arg)
+        signature_parts.append(result)
+
+    for key, value in sorted(kwargs.items()):
+        result = _hash_argument(value, f"{key}_")
+        signature_parts.append(result)
+
+    signature = "_".join(signature_parts)
+    return (func_name, signature)
diff --git a/python/iron/kernel.py b/python/iron/kernel.py
@@ -186,6 +186,34 @@ def resolve(
             # Create the external function
             self._op = external_func(self._name, inputs=self._arg_types)
 
+    def __hash__(self):
+        """
+        Compute a hash for the ExternalFunction based on its properties.
+        This allows ExternalFunction instances to be used in cache keys.
+        """
+        import hashlib
+
+        # Create a string representation of the function's key properties
+        hash_parts = [
+            self._name,
+            str(self._arg_types),
+            str(sorted(self._include_dirs)),
+            str(sorted(self._compile_flags)),
+        ]
+
+        # Include source content for uniqueness
+        # TODO: This solution needs to be extended to handle headers. See https://github.com/Xilinx/mlir-aie/issues/2543
+        if self._source_string:
+            hash_parts.append(self._source_string)
+        elif self._source_file:
+            with open(self._source_file, "r") as f:
+                file_content = f.read()
+            hash_parts.append(file_content)
+
+        # Create hash from combined string
+        combined = "|".join(hash_parts)
+        return int(hashlib.sha256(combined.encode("utf-8")).hexdigest()[:8], 16)
+
     def __call__(self, *args, **kwargs):
         if not self._op:
             raise ValueError("Need to resolve ExternalFunction before it can be called")
diff --git a/python/iron/tensor.py b/python/iron/tensor.py
@@ -484,8 +484,9 @@ def __del__(self):
 
         Releases associated device memory (e.g., XRT buffer object).
         """
-        del self.bo
-        self.bo = None
+        if hasattr(self, "bo"):
+            del self.bo
+            self.bo = None
 
 
 def tensor(data, dtype=np.float32, device="npu"):
diff --git a/python/utils/xrt.py b/python/utils/xrt.py
@@ -74,8 +74,12 @@ def call(self):
         return h
 
     def __del__(self):
-        del self.kernel
-        del self.device
+        if hasattr(self, "kernel"):
+            del self.kernel
+            self.kernel = None
+        if hasattr(self, "device"):
+            del self.device
+            self.device = None
 
 
 # This class wraps up access to the xrt.bo buffer object where sync calls are added
@@ -114,8 +118,9 @@ def sync_from_device(self):
         return self.bo.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE)
 
     def __del__(self):
-        del self.bo
-        self.bo = None
+        if hasattr(self, "bo"):
+            del self.bo
+            self.bo = None
 
 
 class AIE_Application_Error(Exception):
diff --git a/test/python/cache_functionality.py b/test/python/cache_functionality.py