@@ -20,9 +20,294 @@ constexpr char GPU_OCL_MOD_DESTRUCTOR[] = "gcGpuOclModuleDestructor";
 } // namespace mlir::gc::gpu

 #ifndef GC_GPU_OCL_CONST_ONLY
+#include <cassert>
+#include <cstdarg>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <shared_mutex>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>

-// TBD
+#include <CL/cl.h>

+#include <llvm/ADT/SmallString.h>
+
+#include "mlir/ExecutionEngine/ExecutionEngine.h"
+#include "mlir/IR/BuiltinOps.h"
+
+namespace mlir::gc::gpu {
+struct OclContext;
+struct OclModule;
+struct OclModuleBuilder;
+
+struct OclRuntime {
+  cl_context context;
+  cl_device_id device;
+
+  // Returns the available Intel GPU device ids.
+  [[nodiscard]] static llvm::Expected<SmallVector<cl_device_id, 2>>
+  gcIntelDevices(size_t max = std::numeric_limits<size_t>::max());
+
+  [[nodiscard]] static llvm::Expected<OclRuntime> get();
+
+  [[nodiscard]] static llvm::Expected<OclRuntime> get(cl_device_id device);
+
+  [[nodiscard]] static llvm::Expected<OclRuntime> get(cl_command_queue queue);
+
+  [[nodiscard]] static llvm::Expected<OclRuntime> get(cl_context context,
+                                                      cl_device_id device);
+
+  static bool isOutOfOrder(cl_command_queue queue);
+
+  [[nodiscard]] llvm::Expected<cl_command_queue>
+  createQueue(bool outOfOrder = false) const;
+
+  [[nodiscard]] llvm::Expected<bool> releaseQueue(cl_command_queue queue) const;
+
+  [[nodiscard]] llvm::Expected<void *> usmAllocDev(size_t size) const;
+
+  [[nodiscard]] llvm::Expected<void *> usmAllocShared(size_t size) const;
+
+  [[nodiscard]] llvm::Expected<bool> usmFree(const void *ptr) const;
+
+  [[nodiscard]] llvm::Expected<bool> usmCpy(OclContext *ctx, const void *src,
+                                            void *dst, size_t size) const;
+
+  [[nodiscard]] llvm::Expected<bool> usmCpy(OclContext &ctx, const void *src,
+                                            void *dst, size_t size) const {
+    return usmCpy(&ctx, src, dst, size);
+  }
+
+  template <typename T>
+  [[nodiscard]] llvm::Expected<T *> usmNewDev(size_t size) const {
+    auto expected = usmAllocDev(size * sizeof(T));
+    if (expected) {
+      return static_cast<T *>(*expected);
+    }
+    return expected.takeError();
+  }
+
+  template <typename T>
+  [[nodiscard]] llvm::Expected<T *> usmNewShared(size_t size) const {
+    auto expected = usmAllocShared(size * sizeof(T));
+    if (expected) {
+      return static_cast<T *>(*expected);
+    }
+    return expected.takeError();
+  }
+
+  template <typename T>
+  [[nodiscard]] llvm::Expected<bool> usmCpy(OclContext &ctx, const T *src,
+                                            T *dst, size_t size) const {
+    return usmCpy(ctx, static_cast<const void *>(src),
+                  static_cast<void *>(dst), size * sizeof(T));
+  }
+  // Use with caution! This check is reliable for USM pointers, but may
+  // yield false positives for any other kind of pointer.
+  bool isUsm(const void *ptr) const;
+
+  bool operator==(const OclRuntime &other) const {
+    return context == other.context && device == other.device;
+  }
+
+private:
+  struct Ext;
+  struct Exports;
+  friend OclContext;
+  friend OclModuleBuilder;
+  explicit OclRuntime(cl_context context, cl_device_id device, const Ext *ext)
+      : context(context), device(device), ext(ext) {}
+  const Ext *ext;
+};
+} // namespace mlir::gc::gpu
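+
+// A minimal usage sketch (illustrative only, not part of this header; the
+// buffer size is arbitrary and error handling is abbreviated):
+//
+//   using namespace mlir::gc::gpu;
+//   llvm::Error example() {
+//     auto runtime = OclRuntime::get();            // default Intel GPU device
+//     if (!runtime)
+//       return runtime.takeError();
+//     auto queue = runtime->createQueue();
+//     if (!queue)
+//       return queue.takeError();
+//     auto buf = runtime->usmNewDev<float>(1024);  // 1024 floats in device USM
+//     if (!buf)
+//       return buf.takeError();
+//     assert(runtime->isUsm(*buf));
+//     if (auto freed = runtime->usmFree(*buf); !freed)
+//       return freed.takeError();
+//     if (auto released = runtime->releaseQueue(*queue); !released)
+//       return released.takeError();
+//     return llvm::Error::success();
+//   }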
+template <> struct std::hash<const mlir::gc::gpu::OclRuntime> {
+  std::size_t
+  operator()(const mlir::gc::gpu::OclRuntime &runtime) const noexcept {
+    return std::hash<cl_context>()(runtime.context) ^
+           std::hash<cl_device_id>()(runtime.device);
+  }
+};
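+// This specialization allows OclRuntime to be used as the key type of the
+// OclModuleBuilder::cache map below; XOR-combining the two handle hashes is
+// sufficient, since context and device are independent handles.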
+namespace mlir::gc::gpu {
+
+struct OclContext {
+  cl_command_queue const queue;
+  // Preserve the execution order. This is required in case of out-of-order
+  // execution (CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE). When the execution
+  // is completed, the 'lastEvent' field contains the event of the last
+  // enqueued command. If 'preserveOrder' is false, 'waitList' is ignored.
+  const bool preserveOrder;
+  cl_event lastEvent;
+
+  explicit OclContext(cl_command_queue queue, cl_uint waitListLen = 0,
+                      cl_event *waitList = nullptr)
+      : OclContext(queue, OclRuntime::isOutOfOrder(queue), waitListLen,
+                   waitList) {}
+
+  explicit OclContext(cl_command_queue queue, bool preserveOrder,
+                      cl_uint waitListLen, cl_event *waitList)
+      : queue(queue), preserveOrder(preserveOrder), lastEvent(nullptr),
+        waitListLen(preserveOrder ? waitListLen : 0),
+        waitList(preserveOrder ? waitList : nullptr), runtime(nullptr),
+        clPtrs(nullptr) {
+    assert(!OclRuntime::isOutOfOrder(queue) || preserveOrder);
+    assert(preserveOrder || (waitListLen == 0 && waitList == nullptr));
+  }
+
+  void finish();
+
+private:
+  friend OclModule;
+  friend OclRuntime;
+  friend OclRuntime::Exports;
+  cl_uint waitListLen;
+  cl_event *waitList;
+  const OclRuntime *runtime;
+  std::unordered_set<void *> *clPtrs;
+
+  void setLastEvent(cl_event event) {
+    lastEvent = event;
+    if (event) {
+      waitListLen = 1;
+      waitList = &lastEvent;
+    } else {
+      waitListLen = 0;
+      waitList = nullptr;
+    }
+  }
+};
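+
+// Sketch: wrapping an existing command queue (illustrative only; `queue` is
+// assumed to come from OclRuntime::createQueue()):
+//
+//   OclContext ctx(queue);  // order is preserved iff the queue is out-of-order
+//   // ... run kernels via OclModule::exec(ctx, args) ...
+//   ctx.finish();           // wait for the enqueued work to complete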
+
+struct OclModule {
+  static constexpr int64_t ZERO = 0;
+  static constexpr auto ZERO_PTR = const_cast<int64_t *>(&ZERO);
+
+  // The main function arguments, in the format described at
+  // https://mlir.llvm.org/docs/TargetLLVMIR/#c-compatible-wrapper-emission.
+  // Note: the values are not copied, only the pointers are stored!
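+  // For example, a rank-2 memref expands into 3 + 2 * 2 separate arguments:
+  //   allocatedPtr, alignedPtr, offset, shape[0], shape[1], strides[0], strides[1]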
+  template <unsigned N> struct Args {
+
+    void add(void **alignedPtr, size_t rank, const int64_t *shape,
+             const int64_t *strides, bool isUsm = true) {
+      add(alignedPtr, alignedPtr, ZERO_PTR, rank, shape, strides, isUsm);
+    }
+
+    void add(void **allocatedPtr, void **alignedPtr, const int64_t *offset,
+             size_t rank, const int64_t *shape, const int64_t *strides,
+             bool isUsm = true) {
+#ifndef NDEBUG
+      assert(!isUsm || runtime->isUsm(*alignedPtr));
+      // It's recommended to have at least 16-byte alignment
+      assert(reinterpret_cast<std::uintptr_t>(*alignedPtr) % 16 == 0);
+#endif
+
+      args.emplace_back(allocatedPtr);
+      args.emplace_back(alignedPtr);
+      args.emplace_back(const_cast<int64_t *>(offset));
+      for (size_t i = 0; i < rank; i++) {
+        args.emplace_back(const_cast<int64_t *>(&shape[i]));
+      }
+      for (size_t i = 0; i < rank; i++) {
+        args.emplace_back(const_cast<int64_t *>(&strides[i]));
+      }
+      if (!isUsm) {
+        clPtrs.insert(alignedPtr);
+      }
+    }
+
+    template <typename T>
+    void add(T **alignedPtr, size_t rank, const int64_t *shape,
+             const int64_t *strides, bool isUsm = true) {
+      add(reinterpret_cast<void **>(alignedPtr), rank, shape, strides, isUsm);
+    }
+
+    template <typename T>
+    void add(T **allocatedPtr, T **alignedPtr, const int64_t *offset,
+             size_t rank, const int64_t *shape, const int64_t *strides,
+             bool isUsm = true) {
+      add(reinterpret_cast<void **>(allocatedPtr),
+          reinterpret_cast<void **>(alignedPtr), offset, rank, shape, strides,
+          isUsm);
+    }
+
+    void clear() {
+      args.clear();
+      clPtrs.clear();
+    }
+
+  private:
+    friend OclModule;
+    SmallVector<void *, N + 3> args;
+    // Contains the pointers of all non-USM arguments. The arguments are
+    // expected to be either USM or CL pointers, and most probably USM; thus,
+    // in most cases, this set will be empty.
+    std::unordered_set<void *> clPtrs;
+#ifdef NDEBUG
+    explicit Args() = default;
+#else
+    const OclRuntime *runtime;
+    explicit Args(const OclRuntime *runtime) : runtime(runtime) {}
+#endif
+  };
+
+  using MainFunc = void (*)(void **);
+
+  explicit OclModule(const OclRuntime &runtime,
+                     std::unique_ptr<ExecutionEngine> engine, MainFunc main)
+      : runtime(runtime), engine(std::move(engine)), main(main) {}
+
+#ifdef NDEBUG
+  template <unsigned N = 64> Args<N> args() const { return Args<N>(); }
+#else
+  template <unsigned N = 64> Args<N> args() const { return Args<N>(&runtime); }
+#endif
+
+  template <unsigned N> void exec(OclContext &ctx, Args<N> &args) const {
+#ifndef NDEBUG
+    auto rt = OclRuntime::get(ctx.queue);
+    assert(rt);
+    assert(*rt == this->runtime);
+#endif
+    auto size = args.args.size();
+    auto ctxPtr = &ctx;
+    ctx.runtime = &runtime;
+    ctx.clPtrs = &args.clPtrs;
+    args.args.emplace_back(&ctxPtr);
+    args.args.emplace_back(&ctxPtr);
+    args.args.emplace_back(ZERO_PTR);
+    main(args.args.data());
+    args.args.truncate(size);
+  }
+
+  ~OclModule();
+  OclModule(const OclModule &) = delete;
+  OclModule &operator=(const OclModule &) = delete;
+  OclModule(const OclModule &&) = delete;
+  OclModule &operator=(const OclModule &&) = delete;
+
+private:
+  OclRuntime runtime;
+  std::unique_ptr<ExecutionEngine> engine;
+  MainFunc main;
+};
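+
+// Sketch: executing a compiled module (illustrative only; `mod` is a
+// std::shared_ptr<const OclModule> returned by OclModuleBuilder::build(),
+// `in`/`out` are USM pointers, and, since only pointers are stored, the
+// shape/stride arrays must outlive the exec() call):
+//
+//   OclContext ctx(queue);
+//   auto args = mod->args();
+//   int64_t shape[] = {64, 64};
+//   int64_t strides[] = {64, 1};
+//   args.add(&in, 2, shape, strides);
+//   args.add(&out, 2, shape, strides);
+//   mod->exec(ctx, args);
+//   ctx.finish();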
+
+struct OclModuleBuilder {
+  friend OclRuntime;
+  explicit OclModuleBuilder(ModuleOp module);
+  explicit OclModuleBuilder(OwningOpRef<ModuleOp> &module)
+      : OclModuleBuilder(module.release()) {}
+
+  llvm::Expected<std::shared_ptr<const OclModule>>
+  build(const OclRuntime &runtime);
+
+  llvm::Expected<std::shared_ptr<const OclModule>>
+  build(cl_command_queue queue);
+
+  llvm::Expected<std::shared_ptr<const OclModule>> build(cl_context context,
+                                                         cl_device_id device);
+
+private:
+  std::shared_mutex mux;
+  ModuleOp mlirModule;
+  SmallString<32> funcName;
+  std::unordered_map<const OclRuntime, std::shared_ptr<const OclModule>> cache;
+};
+} // namespace mlir::gc::gpu
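+
+// Sketch: compiling an MLIR module and caching the result per runtime
+// (illustrative only; `mlirModule` is assumed to contain an entry function
+// in the expected format):
+//
+//   mlir::gc::gpu::OclModuleBuilder builder(mlirModule);
+//   auto mod = builder.build(queue);  // the OclRuntime is derived from the queue
+//   if (!mod)
+//     llvm::report_fatal_error(mod.takeError());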
 #else
 #undef GC_GPU_OCL_CONST_ONLY
 #endif