[Profiling] add profiling code to get kernel submit/execute time (#548)

Dewei-Wang-sh · web-flow · commit 7e4a623b0b1f · 2022-12-14T08:34:04.000+08:00
* add tracer benchmark usage
diff --git a/README.md b/README.md
@@ -173,3 +173,18 @@ Add '-v' to the above command-line to get verbose output.
 ## License
 This code is made available under the Apache License 2.0 with LLVM Exceptions.
 See the `LICENSE.txt` file for more details.
+
+## Profiling kernel execution time
+### SYCL event
+```sh
+export IMEX_ENABLE_PROFILING=ON
+# run the test as usual; each kernel launch prints its execution time (ms) to stdout
+```
+### trace tools
+```sh
+python {your_path}/imex-runner.py xxx -o test.mlir
+mlir-translate test.mlir -mlir-to-llvmir -o test.ll
+llc test.ll -filetype=obj -o test.o
+clang++ test.o {path}/libmlir_runner_utils.so {path}/libmlir_c_runner_utils.so {path}/libsycl-runtime.so -no-pie -o test
+ze_tracer ./test
+```
diff --git a/include/imex/Transforms/Passes.td b/include/imex/Transforms/Passes.td
@@ -46,6 +46,7 @@ def InsertGPUAllocs : Pass<"insert-gpu-allocs", "::mlir::func::FuncOp"> {
 def SetSPIRVCapabilities : Pass<"set-spirv-capabilities"> {
   let summary = "Sets Spirv capabilities";
   let constructor = "imex::createSetSPIRVCapabilitiesPass()";
+  let dependentDialects = ["::mlir::spirv::SPIRVDialect"];
   let options = [
     Option<"clientAPI", "client-api", "std::string", /*default=*/"\"opencl\"",
            "The client API to use for setting Spirv capabilities">
diff --git a/lib/ExecutionEngine/SYCLRUNTIME/SyclRuntimeWrappers.cpp b/lib/ExecutionEngine/SYCLRUNTIME/SyclRuntimeWrappers.cpp
@@ -99,30 +99,31 @@ struct GPUSYCLQUEUE {
   sycl::context syclContext_;
   sycl::queue syclQueue_;
 
-  GPUSYCLQUEUE() {
+  GPUSYCLQUEUE(sycl::property_list propList) {
 
     syclDevice_ = getDefaultDevice();
     syclContext_ = sycl::context(syclDevice_);
-    syclQueue_ = sycl::queue(syclContext_, syclDevice_);
+    syclQueue_ = sycl::queue(syclContext_, syclDevice_, propList);
   }
 
-  GPUSYCLQUEUE(sycl::device *device, sycl::context *context) {
+  GPUSYCLQUEUE(sycl::device *device, sycl::context *context,
+               sycl::property_list propList) {
     syclDevice_ = *device;
     syclContext_ = *context;
-    syclQueue_ = sycl::queue(syclContext_, syclDevice_);
+    syclQueue_ = sycl::queue(syclContext_, syclDevice_, propList);
   }
-  GPUSYCLQUEUE(sycl::device *device) {
+  GPUSYCLQUEUE(sycl::device *device, sycl::property_list propList) {
 
     syclDevice_ = *device;
     syclContext_ = sycl::context(syclDevice_);
-    syclQueue_ = sycl::queue(syclContext_, syclDevice_);
+    syclQueue_ = sycl::queue(syclContext_, syclDevice_, propList);
   }
 
-  GPUSYCLQUEUE(sycl::context *context) {
+  GPUSYCLQUEUE(sycl::context *context, sycl::property_list propList) {
 
     syclDevice_ = getDefaultDevice();
     syclContext_ = *context;
-    syclQueue_ = sycl::queue(syclContext_, syclDevice_);
+    syclQueue_ = sycl::queue(syclContext_, syclDevice_, propList);
   }
 
 }; // end of GPUSYCLQUEUE
@@ -197,32 +198,49 @@ static void launchKernel(GPUSYCLQUEUE *queue, sycl::kernel *kernel,
 
   auto paramsCount = countUntil(params, ParamDesc{nullptr, 0});
 
-  syclQueue.submit([&](sycl::handler &cgh) {
+  sycl::event event = syclQueue.submit([&](sycl::handler &cgh) {
     for (size_t i = 0; i < paramsCount; i++) {
       auto param = params[i];
       cgh.set_arg(static_cast<uint32_t>(i),
                   *(static_cast<void **>(param.data)));
     }
     cgh.parallel_for(syclNdRange, *kernel);
   });
+  if (getenv("IMEX_ENABLE_PROFILING")) {
+    // auto submitTime = event.get_profiling_info<
+    //     cl::sycl::info::event_profiling::command_submit>();
+    auto startTime = event.get_profiling_info<
+        cl::sycl::info::event_profiling::command_start>();
+    auto endTime =
+        event
+            .get_profiling_info<cl::sycl::info::event_profiling::command_end>();
+    // auto submissionTime = float(startTime - submitTime) / 1000000.0f;
+    // fprintf(stdout, "the kernel submission time is %f ms\n", submissionTime);
+    auto executionTime = float(endTime - startTime) / 1000000.0f;
+    fprintf(stdout, "the kernel execution time is %f ms\n", executionTime);
+  }
 }
 
 // Wrappers
 
 extern "C" SYCL_RUNTIME_EXPORT GPUSYCLQUEUE *gpuCreateStream(void *device,
                                                              void *context) {
+  auto propList = sycl::property_list{};
+  if (getenv("IMEX_ENABLE_PROFILING")) {
+    propList = sycl::property_list{sycl::property::queue::enable_profiling()};
+  }
   return catchAll([&]() {
     if (!device && !context) {
-      return new GPUSYCLQUEUE();
+      return new GPUSYCLQUEUE(propList);
     } else if (device && context) {
       // TODO: Check if the pointers/address is valid and holds the correct
       // device and context
       return new GPUSYCLQUEUE(static_cast<sycl::device *>(device),
-                              static_cast<sycl::context *>(context));
+                              static_cast<sycl::context *>(context), propList);
     } else if (device && !context) {
-      return new GPUSYCLQUEUE(static_cast<sycl::device *>(device));
+      return new GPUSYCLQUEUE(static_cast<sycl::device *>(device), propList);
     } else {
-      return new GPUSYCLQUEUE(static_cast<sycl::context *>(context));
+      return new GPUSYCLQUEUE(static_cast<sycl::context *>(context), propList);
     }
   });
 }
diff --git a/tools/imex-runner/imex-runner.py.in b/tools/imex-runner/imex-runner.py.in
@@ -73,6 +73,7 @@ parser = argparse.ArgumentParser(
     description="Run imex-opt, optionally pipe result into selected mlir runner (default: mlir-cpu-runner) and then optionally pipe output into FileCheck"
 )
 parser.add_argument("--input-file", "-i", default=None, help="input MLIR file")
+parser.add_argument("--output-file", "-o", default=None, help="output MLIR file")
 parser.add_argument("--pass-pipeline-file", "-f", default=None, help="file defining pass pipeline")
 parser.add_argument("--pass-pipeline", "-p", default=None, help="pass pipeline (string)")
 parser.add_argument("--imex-print-before-all", "-b", action='store_true', dest='before', help="print ir before all passes")
@@ -165,6 +166,12 @@ if args.after:
     cmd.append(f'--mlir-print-ir-after-all')
 cmds.append(cmd)
 
+# output to a file: add a `tee` stage to the pipeline only when --output-file is given
+if args.output_file:
+    cmd=['tee']
+    cmd.append(args.output_file)
+    cmds.append(cmd)
+
 # build runner command
 if not args.no_mlir_runner:
     # build runner command: all unknown args will be passed to the runner