
Commit 4d4d951

nbpatelsilee2 authored and committed

Add zeModuleDestroy in struct destructor
Cache Spirv Module

1 parent 526d235 · commit 4d4d951

File tree

4 files changed: 95 additions & 11 deletions


lib/ExecutionEngine/SYCLRUNTIME/SyclRuntimeWrappers.cpp

Lines changed: 27 additions & 1 deletion
@@ -26,6 +26,8 @@
 
 #include <CL/sycl.hpp>
 #include <level_zero/ze_api.h>
+#include <map>
+#include <mutex>
 #include <sycl/ext/oneapi/backend/level_zero.hpp>
 
 #ifdef _WIN32
@@ -62,6 +64,21 @@ template <typename F> auto catchAll(F &&func) {
 
 } // namespace
 
+struct SpirvModule {
+  ze_module_handle_t module = nullptr;
+  ~SpirvModule();
+};
+
+namespace {
+// Create a Map for the spirv module lookup
+std::map<void *, SpirvModule> moduleCache;
+std::mutex mutexLock;
+} // namespace
+
+SpirvModule::~SpirvModule() {
+  L0_SAFE_CALL(zeModuleDestroy(SpirvModule::module));
+}
+
 struct ParamDesc {
   void *data;
   size_t size;
@@ -153,6 +170,13 @@ static ze_module_handle_t loadModule(GPUSYCLQUEUE *queue, const void *data,
   assert(data);
   auto syclQueue = queue->syclQueue_;
   ze_module_handle_t zeModule;
+
+  auto it = moduleCache.find((void *)data);
+  // Check the map if the module is present/cached.
+  if (it != moduleCache.end()) {
+    return it->second.module;
+  }
+
   ze_module_desc_t desc = {ZE_STRUCTURE_TYPE_MODULE_DESC,
                            nullptr,
                            ZE_MODULE_FORMAT_IL_SPIRV,
@@ -165,6 +189,8 @@ static ze_module_handle_t loadModule(GPUSYCLQUEUE *queue, const void *data,
   auto zeContext = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(
       syclQueue.get_context());
   L0_SAFE_CALL(zeModuleCreate(zeContext, zeDevice, &desc, &zeModule, nullptr));
+  std::lock_guard<std::mutex> entryLock(mutexLock);
+  moduleCache[(void *)data].module = zeModule;
   return zeModule;
 }
 
@@ -177,8 +203,8 @@ static sycl::kernel *getKernel(GPUSYCLQUEUE *queue, ze_module_handle_t zeModule,
   sycl::kernel *syclKernel;
   ze_kernel_desc_t desc = {};
   desc.pKernelName = name;
-  L0_SAFE_CALL(zeKernelCreate(zeModule, &desc, &zeKernel));
 
+  L0_SAFE_CALL(zeKernelCreate(zeModule, &desc, &zeKernel));
   sycl::kernel_bundle<sycl::bundle_state::executable> kernelBundle =
       sycl::make_kernel_bundle<sycl::backend::ext_oneapi_level_zero,
                                sycl::bundle_state::executable>(
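
In short, the change caches each SPIR-V module handle in a global map keyed by the address of the input blob, and the new SpirvModule destructor releases the handle via zeModuleDestroy when an entry is destroyed. Below is a minimal self-contained sketch of that pattern, not the committed implementation: a plain int stands in for ze_module_handle_t/zeModuleCreate, and the hypothetical loadOrReuse plays the role of loadModule. Note one deliberate difference: the sketch holds the mutex across both lookup and insert, whereas the committed change takes the lock only around the insertion.

    // Minimal sketch of the pointer-keyed module cache (hypothetical names).
    #include <cstdio>
    #include <map>
    #include <mutex>

    struct CachedModule {
      int handle = 0;
      // Mirrors ~SpirvModule() calling zeModuleDestroy on the cached handle.
      ~CachedModule() { std::printf("destroying handle %d\n", handle); }
    };

    namespace {
    std::map<const void *, CachedModule> cache; // keyed by the blob's address
    std::mutex cacheMutex;
    } // namespace

    int loadOrReuse(const void *data) {
      std::lock_guard<std::mutex> lock(cacheMutex); // guard lookup and insert
      auto it = cache.find(data);
      if (it != cache.end())
        return it->second.handle; // cache hit: skip the expensive (re)build
      static int nextHandle = 1;
      int handle = nextHandle++; // stands in for zeModuleCreate(...)
      cache[data].handle = handle;
      return handle;
    }

    int main() {
      static const char blob[] = "spirv-binary";
      int a = loadOrReuse(blob);
      int b = loadOrReuse(blob); // second call hits the cache
      std::printf("%d %d\n", a, b); // prints "1 1"
    }

Keying on the blob address assumes each kernel binary stays at a stable address for the life of the process, which presumably holds when the SPIR-V is embedded as a constant in the compiled module.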

test/PlaidML/CppEdsl.Add.mlir

Lines changed: 10 additions & 5 deletions
@@ -2,11 +2,11 @@
 // RUN: --runner imex-cpu-runner -e main \
 // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils \
 // RUN: --entry-point-result=void --filecheck
-// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \
+// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm-caching.pp \
 // RUN: --runner imex-cpu-runner -e main \
 // RUN: --entry-point-result=void \
 // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%levelzero_runtime --filecheck
-// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \
+// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm-caching.pp \
 // RUN: --runner imex-cpu-runner -e main \
 // RUN: --entry-point-result=void \
 // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck
@@ -15,9 +15,14 @@ module @add {
 func.func @main() {
   %0= arith.constant dense<[[1, 2, 3], [4, 5, 4102], [16777223, 4294967304, 1099511627785]]>:tensor<3x3xi64>
   %1= arith.constant dense<[[1, 4098, 3], [16777220, 5, 4294967302], [7, 1099511627784, 9]]>:tensor<3x3xi64>
-  %2 = call @test(%0,%1) : (tensor<3x3xi64>,tensor<3x3xi64>) -> tensor<3x3xi64>
-  %unranked = tensor.cast %2 : tensor<3x3xi64>to tensor<*xi64>
-  call @printMemrefI64(%unranked) : (tensor<*xi64>) -> ()
+  %lb = arith.constant 0 : index
+  %ub = arith.constant 100 : index
+  %step = arith.constant 1 : index
+  scf.for %temp = %lb to %ub step %step {
+    %2 = func.call @test(%0,%1) : (tensor<3x3xi64>,tensor<3x3xi64>) -> tensor<3x3xi64>
+    %unranked = tensor.cast %2 : tensor<3x3xi64> to tensor<*xi64>
+    func.call @printMemrefI64(%unranked) : (tensor<*xi64>) -> ()
+  }
   return
 }
 func.func private @printMemrefI64(tensor<*xi64>)
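
The new scf.for loop calls @test 100 times on the same constant tensors, so the first iteration compiles and caches the SPIR-V module and the remaining iterations exercise the moduleCache fast path added in SyclRuntimeWrappers.cpp; the GEMM test below receives the same treatment.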

test/PlaidML/OpTest.GEMM_FLOAT32.mlir

Lines changed: 10 additions & 5 deletions
@@ -2,11 +2,11 @@
 // RUN: --runner imex-cpu-runner -e main \
 // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils \
 // RUN: --entry-point-result=void --filecheck
-// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \
+// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm-caching.pp \
 // RUN: --runner imex-cpu-runner -e main \
 // RUN: --entry-point-result=void \
 // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%levelzero_runtime --filecheck
-// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm.pp \
+// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/linalg-to-llvm-caching.pp \
 // RUN: --runner imex-cpu-runner -e main \
 // RUN: --entry-point-result=void \
 // RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck
@@ -18,9 +18,14 @@ func.func @main() {
   %0= arith.constant dense<[[0.5, 0.2, 4.0], [1.0, 1.0, 2.0], [3.0, 3.0, 0.3]]>:tensor<3x3xf32>
   %1= arith.constant dense<[[1.0, 2.0, 3.0], [3.0, 4.0, 0.5], [3.0, 3.0, 3.0]]>:tensor<3x3xf32>
   %2= arith.constant dense<[[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0]]>:tensor<3x3xf32>
-  %3 = call @test(%0,%1,%2) : (tensor<3x3xf32>,tensor<3x3xf32>,tensor<3x3xf32>) -> tensor<3x3xf32>
-  %unranked = tensor.cast %3 : tensor<3x3xf32>to tensor<*xf32>
-  call @printMemrefF32(%unranked) : (tensor<*xf32>) -> ()
+  %lb = arith.constant 0 : index
+  %ub = arith.constant 100 : index
+  %step = arith.constant 1 : index
+  scf.for %temp = %lb to %ub step %step {
+    %3 = func.call @test(%0,%1,%2) : (tensor<3x3xf32>,tensor<3x3xf32>,tensor<3x3xf32>) -> tensor<3x3xf32>
+    %unranked = tensor.cast %3 : tensor<3x3xf32> to tensor<*xf32>
+    func.call @printMemrefF32(%unranked) : (tensor<*xf32>) -> ()
+  }
 // CHECK: Unranked Memref base@ = {{(0x)?[-9a-f]*}}
 // CHECK-NEXT: [14.1, 14.8, 14.6]
 // CHECK-NEXT: [11, 13, 10.5]
test/PlaidML/linalg-to-llvm-caching.pp

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
+// linalg dialect to gpu dialect lowering pipeline
+// Ready for vulkan runner or narrow scope l0/sycl runner starting from GPU dialect.
+builtin.module(convert-tensor-to-linalg
+    arith-bufferize
+    func.func(empty-tensor-to-alloc-tensor
+      eliminate-empty-tensors
+      scf-bufferize
+      shape-bufferize
+      linalg-bufferize
+      bufferization-bufferize
+      tensor-bufferize)
+    func-bufferize
+    func.func(finalizing-bufferize
+      convert-linalg-to-parallel-loops
+      gpu-map-parallel-loops
+      convert-parallel-loops-to-gpu)
+    // insert-gpu-allocs pass can have client-api = opencl or vulkan args
+    func.func(insert-gpu-allocs{client-api=opencl})
+    canonicalize
+    normalize-memrefs
+    // Unstride memrefs does not seem to be needed.
+    // func.func(unstride-memrefs)
+    func.func(lower-affine)
+    gpu-kernel-outlining
+    canonicalize
+    cse
+    // The following set-spirv-* passes can have client-api = opencl or vulkan args
+    set-spirv-capabilities{client-api=opencl}
+    gpu.module(set-spirv-abi-attrs{client-api=opencl})
+    canonicalize
+    fold-memref-alias-ops
+    imex-convert-gpu-to-spirv
+    spirv.module(spirv-lower-abi-attrs
+      spirv-update-vce)
+    func.func(llvm-request-c-wrappers)
+    serialize-spirv
+    convert-gpu-to-gpux
+    convert-scf-to-cf
+    convert-cf-to-llvm
+    convert-arith-to-llvm
+    convert-func-to-llvm
+    convert-math-to-llvm
+    convert-gpux-to-llvm
+    expand-strided-metadata
+    lower-affine
+    convert-memref-to-llvm
+    reconcile-unrealized-casts)
+// End
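
Both updated tests select this pipeline through --pass-pipeline-file=%p/linalg-to-llvm-caching.pp in their RUN lines; serialize-spirv embeds the kernel binary whose address presumably ends up as the data pointer keying moduleCache in SyclRuntimeWrappers.cpp.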
