Simplify allocation: context manager now yields the input memrefs

tkarna · tkarna · commit ed39be1a8bb4 · 2025-11-27T11:20:18.000+02:00
diff --git a/python/examples/xegpu_matmul/matmul.py b/python/examples/xegpu_matmul/matmul.py
@@ -92,11 +92,6 @@ def _allocate_array(
         self.gpu_memrefs[key] = mref
         return mref
 
-    def _allocate_inputs(self, execution_engine: ExecutionEngine):
-        self._allocate_array("A", self.a_shape, self.ab_type, execution_engine)
-        self._allocate_array("B", self.b_shape, self.ab_type, execution_engine)
-        self._allocate_array("C", self.c_shape, self.c_type, execution_engine)
-
     def _deallocate_all(self, execution_engine: ExecutionEngine):
         for (_, dtype_str), mref in self.gpu_memrefs.items():
             dealloc_func = execution_engine.lookup("gpu_dealloc_" + dtype_str)
@@ -105,10 +100,10 @@ def _deallocate_all(self, execution_engine: ExecutionEngine):
         self.gpu_memrefs = {}
 
     @contextmanager
-    def allocate(self, execution_engine: ExecutionEngine):
+    def allocate_inputs(self, execution_engine: ExecutionEngine):
         try:
-            self._allocate_inputs(execution_engine)
-            yield None
+            inputs = self._get_input_arrays(execution_engine)
+            yield inputs
         finally:
             self._deallocate_all(execution_engine)
 
@@ -141,7 +136,7 @@ def _reference_solution(self) -> np.ndarray:
             raise NotImplementedError("Bias verification not implemented")
         return C_ref
 
-    def get_input_arrays(
+    def _get_input_arrays(
         self, execution_engine: ExecutionEngine
     ) -> list[ctypes.Structure]:
         A_gpu = self._allocate_array("A", self.a_shape, self.ab_type, execution_engine)
diff --git a/python/examples/xegpu_matmul/runner.py b/python/examples/xegpu_matmul/runner.py
@@ -73,9 +73,8 @@ def execute(
     # get execution engine
     engine = get_engine(payload_module, requirements=workload.requirements())
 
-    with workload.allocate(execution_engine=engine):
+    with workload.allocate_inputs(execution_engine=engine) as inputs:
         # prepare function arguments
-        inputs = workload.get_input_arrays(execution_engine=engine)
         pointers = [ctypes.pointer(ctypes.pointer(m)) for m in inputs]
         packed_args = get_packed_arg(pointers)
 
@@ -150,8 +149,7 @@ def benchmark(*args):
     # get execution engine, rtclock requires mlir_c_runner
     engine = get_engine(payload_module)
 
-    with workload.allocate(execution_engine=engine):
-        inputs = workload.get_input_arrays(execution_engine=engine)
+    with workload.allocate_inputs(execution_engine=engine) as inputs:
         pointers = [ctypes.pointer(ctypes.pointer(m)) for m in inputs]
         if check_correctness:
             # call payload once to verify correctness