@@ -47,6 +47,12 @@ auto read_spirv(const std::string &filename) {
   return read_file_as_bytes(filename);
 }
 
+// Host output tensor buffers and their indexes into dev_buffers
+struct TensorBuffer {
+  torch::Tensor buffer_ptr;
+  size_t index;
+};
+
 // Structure that contains Triton kernel arguments
 struct KernelArguments {
   int gridX;
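The new TensorBuffer struct pairs a host-side result tensor with the position of its device allocation in KernelArguments::dev_buffers. A minimal sketch of a populated entry (the shape and index below are hypothetical, not taken from this change):

    // Hypothetical example: the third device buffer holds a 1024-element
    // float32 output, staged into a zero-initialized host tensor.
    TensorBuffer tb;
    tb.buffer_ptr = torch::zeros({1024}, torch::dtype(torch::kFloat32));
    tb.index = 2; // offset of the matching pointer in dev_buffers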
@@ -62,11 +68,11 @@ struct KernelArguments {
   std::string spv_name;
   ordered_json jsonData;
   std::vector<char *> dev_buffers;
-  torch::Tensor host_outbuffer;
-  std::string out_tensor_name;
+  std::vector<TensorBuffer> host_outbuffers;
+  std::vector<std::string> out_tensor_names;
   std::string spirv_dump_dir;
 
-  KernelArguments(const std::string &outtensorname) {
+  KernelArguments(const std::vector<std::string> &outtensornames) {
     // Check if the triton_xpu_dump path exists; if not, point to the
     // current directory
     auto env_path = std::getenv("TRITON_XPU_DUMP_SPIRV_KERNEL_ARGS");
@@ -97,7 +103,7 @@ struct KernelArguments {
     build_flags = jsonData.at("build_flags");
     spv_name =
        spirv_dump_dir + "/" + jsonData.at("spv_name").get<std::string>();
-    out_tensor_name = outtensorname;
+    out_tensor_names = outtensornames;
   }
 };
 
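With the constructor now taking a vector, a caller can request several outputs at once. A hedged usage sketch (the tensor names are placeholders; real names must match "name" entries in the dumped JSON argument_list):

    // Hypothetical call site requesting two output tensors.
    KernelArguments args({"out_ptr0", "out_ptr1"});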
@@ -348,13 +354,14 @@ at::TensorOptions getTensorOptions(const std::string &dtype) {
   }
 }
 
-at::Tensor launchKernel(sycl::queue stream, sycl::kernel kernel,
-                        KernelArguments triton_args, bool get_kernel_time) {
+std::vector<TensorBuffer> launchKernel(sycl::queue stream, sycl::kernel kernel,
+                                       KernelArguments triton_args,
+                                       bool get_kernel_time) {
 
   auto tensor_ptr = [](const torch::Tensor &t) -> void * {
     return static_cast<void *>(t.data_ptr());
   };
-  int devout_idx = 0;
+
   for (auto &item : triton_args.jsonData["argument_list"]) {
     if (item.contains("type")) {
       if (item.at("type").get<std::string>() == "tensor") {
@@ -369,36 +376,40 @@ at::Tensor launchKernel(sycl::queue stream, sycl::kernel kernel,
             .wait_and_throw();
 
         // Configure output tensor
-        if (item.at("name").get<std::string>() == triton_args.out_tensor_name) {
-          devout_idx = triton_args.dev_buffers.size() - 1;
-          triton_args.host_outbuffer = torch::zeros(
-              {tensor.sizes()}, getTensorOptions(item.at("dtype")));
-          std::cout << "Tensor output: " << triton_args.host_outbuffer.sizes()
-                    << ", " << triton_args.host_outbuffer.scalar_type() << " ("
-                    << triton_args.host_outbuffer.nbytes() << " bytes)"
-                    << std::endl;
+        if (std::find(triton_args.out_tensor_names.begin(),
+                      triton_args.out_tensor_names.end(),
+                      item.at("name").get<std::string>()) !=
+            triton_args.out_tensor_names.end()) {
+          TensorBuffer tb;
+          tb.buffer_ptr = torch::zeros({tensor.sizes()},
+                                       getTensorOptions(item.at("dtype")));
+          tb.index = triton_args.dev_buffers.size() - 1;
+          triton_args.host_outbuffers.push_back(tb);
+          std::cout
+              << "Tensor output[" << triton_args.host_outbuffers.back().index
+              << "]: " << triton_args.host_outbuffers.back().buffer_ptr.sizes()
+              << ", "
+              << triton_args.host_outbuffers.back().buffer_ptr.scalar_type()
+              << " (" << triton_args.host_outbuffers.back().buffer_ptr.nbytes()
+              << " bytes)" << std::endl;
         }
       }
     } else {
      throw std::runtime_error("Type entry is missing in JSON argument_list");
    }
  }
 
-  if (!triton_args.host_outbuffer.defined()) {
-    std::string message = "Output tensor isn't configured; \
-    the second positional parameter is ";
-    throw std::runtime_error(message + triton_args.out_tensor_name);
-  }
-
   // Launch SYCL kernel
   sycl_kernel_launch(stream, kernel, triton_args, get_kernel_time);
 
-  // copy back
-  stream
-      .memcpy(tensor_ptr(triton_args.host_outbuffer),
-              triton_args.dev_buffers.at(devout_idx),
-              triton_args.host_outbuffer.nbytes())
-      .wait_and_throw();
+  // Copy back the output tensors
+  for (const auto &item : triton_args.host_outbuffers) {
+    stream
+        .memcpy(tensor_ptr(item.buffer_ptr),
+                triton_args.dev_buffers.at(item.index),
+                item.buffer_ptr.nbytes())
+        .wait_and_throw();
+  }
 
   for (auto *dev_ptr : triton_args.dev_buffers) {
     if (dev_ptr)
@@ -407,7 +418,7 @@ at::Tensor launchKernel(sycl::queue stream, sycl::kernel kernel,
       throw std::runtime_error("sycl::free failed\n");
   }
 
-  return triton_args.host_outbuffer;
+  return triton_args.host_outbuffers;
 }
 
 int main(int argc, char **argv) {
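launchKernel now matches each tensor argument's name against out_tensor_names with a linear std::find (which requires <algorithm>). For the handful of outputs a Triton kernel typically has this is fine; if the list grew large, a set-based lookup would be a drop-in alternative. A sketch under that assumption, not part of this commit:

    // Sketch only: build the set once, then test membership in O(1) average
    // time instead of scanning the vector for every tensor argument.
    // Requires #include <unordered_set>.
    const std::unordered_set<std::string> out_names(
        triton_args.out_tensor_names.begin(),
        triton_args.out_tensor_names.end());
    if (out_names.count(item.at("name").get<std::string>()) != 0) {
      // ... configure the TensorBuffer as in the hunk above ...
    }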
@@ -430,7 +441,7 @@ int main(int argc, char **argv) {
   initDevices(&q);
 
   // Parse the JSON file and create argument dictionary
-  KernelArguments tritonArgDict(cliopts.output_tensor);
+  KernelArguments tritonArgDict(cliopts.output_tensors);
 
   // read spirv
   auto spirv = read_spirv(tritonArgDict.spv_name);
@@ -444,12 +455,16 @@ int main(int argc, char **argv) {
     std::cout << "Loaded kernel with " << n_regs << " registers and "
               << n_spills << " register spills." << std::endl;
 
-    auto output =
+    auto output_tensors =
         launchKernel(q, kernel, tritonArgDict, cliopts.get_kernel_time);
 
-    auto output_tensor = tritonArgDict.spirv_dump_dir + "/cpp_outs.pt";
-    write_tensor(output_tensor, output);
-    std::cout << "Output Tensor Path: " << output_tensor << std::endl;
+    // Write each output tensor to its own indexed file
+    for (auto &item : output_tensors) {
+      auto output_tensor = tritonArgDict.spirv_dump_dir + "/cpp_outs_" +
+                           std::to_string(item.index) + ".pt";
+      write_tensor(output_tensor, item.buffer_ptr);
+      std::cout << "Output Tensor Path: " << output_tensor << std::endl;
+    }
   } catch (const std::runtime_error &e) {
     std::cerr << "Error: " << e.what() << std::endl;
     return EXIT_FAILURE;
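The diff assumes cliopts.output_tensors now carries multiple names, but the CLI parsing itself is outside these hunks. One plausible way to produce such a vector (a hypothetical helper, not shown in this commit) is to split a comma-separated flag value:

    #include <sstream>
    #include <string>
    #include <vector>

    // Hypothetical helper: turns "out0,out1" into {"out0", "out1"}.
    std::vector<std::string> split_names(const std::string &csv) {
      std::vector<std::string> names;
      std::stringstream ss(csv);
      std::string name;
      while (std::getline(ss, name, ','))
        if (!name.empty())
          names.push_back(name);
      return names;
    }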