Added test

AndreyPavlenko · AndreyPavlenko · commit d8daf5710afc · 2024-09-20T17:23:14.000+02:00
diff --git a/include/gc/ExecutionEngine/GPURuntime/GpuOclRuntime.h b/include/gc/ExecutionEngine/GPURuntime/GpuOclRuntime.h
@@ -299,10 +299,8 @@ template <unsigned N> struct OclModuleExecutorBase {
     assert(argCounter == mod->functionType.getNumInputs());
   }
 
-  void checkArg(void *alignedPtr, bool isUsm = true) const {
+  void checkArg(const void *alignedPtr, bool isUsm = true) const {
     assert(!isUsm || mod->runtime.isUsm(alignedPtr));
-    // It's recommended to have at least 16-byte alignment
-    assert(reinterpret_cast<std::uintptr_t>(alignedPtr) % 16 == 0);
   }
 #endif
 };
diff --git a/test/mlir/unittests/ExecutionEngine/GPU/GpuOclRuntimeTest.cpp b/test/mlir/unittests/ExecutionEngine/GPU/GpuOclRuntimeTest.cpp
@@ -19,9 +19,6 @@
 #include "gtest/gtest.h"
 #include <memory>
 
-#include <mlir/Dialect/GPU/Transforms/Passes.h>
-
-#include "gc/Transforms/Passes.h"
 #include "mlir/Target/LLVMIR/Export.h"
 #include "mlir/Target/LLVMIR/ModuleTranslation.h"
 #include <CL/cl_ext.h>
@@ -59,40 +56,69 @@ module @test {
 }
 )mlir";
 
-template <unsigned N, unsigned M = N> struct TestAdd {
+constexpr char matmulAddStatic[] = R"mlir(
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+  func.func @entry(%arg0: memref<64x128xf16>, %arg1: memref<128x128xf16>, %arg2: memref<64x128xf16>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<64x128xf16>
+    %1 = bufferization.to_tensor %arg1 restrict : memref<128x128xf16>
+    %2 = tensor.empty() : tensor<64x128xf16>
+    %cst = arith.constant 0.000000e+00 : f16
+    %3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<64x128xf16>) -> tensor<64x128xf16>
+    %4 = linalg.matmul_transpose_b ins(%0, %1 : tensor<64x128xf16>, tensor<128x128xf16>) outs(%3 : tensor<64x128xf16>) -> tensor<64x128xf16>
+    %5 = tensor.empty() : tensor<64x128xf16>
+    %6 = linalg.add ins(%4, %0 : tensor<64x128xf16>, tensor<64x128xf16>) outs(%5 : tensor<64x128xf16>) -> tensor<64x128xf16>
+    bufferization.materialize_in_destination %6 in restrict writable %arg2 : (tensor<64x128xf16>, memref<64x128xf16>) -> ()
+    return
+  }
+}
+)mlir";
+
+struct TestBase {
   OclRuntime runtime = gcGetOrReport(OclRuntime::get());
   cl_command_queue queue = gcGetOrReport(runtime.createQueue());
+  OclContext ctx{runtime, queue};
+  MLIRContext mlirCtx{gc::initCompilerAndGetDialects()};
+  // Hook each test overrides to run the built module.
+  virtual void exec(std::shared_ptr<const OclModule> &mod, OclContext &ctx) = 0;
+  // Releases the command queue created for the `queue` member above.
+  virtual ~TestBase() { gcGetOrReport(runtime.releaseQueue(queue)); }
+  // Parses the MLIR text in `code` into a ModuleOp using mlirCtx.
+  OwningOpRef<ModuleOp> parse(const char *code) {
+    std::unique_ptr<llvm::MemoryBuffer> memBuf =
+        llvm::MemoryBuffer::getMemBuffer(code);
+    llvm::SourceMgr srcMgr;
+    srcMgr.AddNewSourceBuffer(std::move(memBuf), SMLoc());
+    return parseSourceFile<ModuleOp>(srcMgr, &mlirCtx);
+  }
+};
 
+template <unsigned N, unsigned M = N> struct TestAdd : TestBase {
   static constexpr unsigned size = N * M;
   float *buf0 = gcGetOrReport(runtime.usmNewDev<float>(size));
   float *buf1 = gcGetOrReport(runtime.usmNewDev<float>(size));
   float *buf2 = gcGetOrReport(runtime.usmNewShared<float>(size));
-  MLIRContext mlirCtx{gc::initCompilerAndGetDialects()};
-  float cpuBuf1[size] = {};
-  float cpuBuf2[size] = {};
 
-  explicit TestAdd() { std::fill(cpuBuf1, cpuBuf1 + size, 2.0f); }
+  explicit TestAdd() {
+    float cpuBuf[size];
+    std::fill(cpuBuf, cpuBuf + size, 2.0f);
+    assert(runtime.usmCpy(ctx, cpuBuf, buf0, size));
+    assert(runtime.usmCpy(ctx, cpuBuf, buf1, size));
+  }
 
-  virtual ~TestAdd() {
-    gcGetOrReport(runtime.releaseQueue(queue));
+  ~TestAdd() override {
     assert(runtime.usmFree(buf0));
     assert(runtime.usmFree(buf1));
     assert(runtime.usmFree(buf2));
+    // TestBase::~TestBase() runs implicitly after this body; do not call it.
   }
 
-  virtual void exec(std::shared_ptr<const OclModule> &mod, OclContext &ctx) = 0;
-
   void test(const char *code) {
-    OclContext ctx(runtime, queue);
-    assert(runtime.usmCpy(ctx, cpuBuf1, buf0, size));
-    assert(runtime.usmCpy(ctx, cpuBuf1, buf1, size));
-
     OclModuleBuilder builder(parse(code));
     auto mod = gcGetOrReport(builder.build(runtime));
-
     exec(mod, ctx);
 
-    assert(runtime.usmCpy(ctx, buf2, cpuBuf2, size));
+    float cpuBuf[size];
+    assert(runtime.usmCpy(ctx, buf2, cpuBuf, size));
     gcGetOrReport(ctx.finish());
 
     for (unsigned i = 0; i < size; i++) {
@@ -101,18 +127,46 @@ template <unsigned N, unsigned M = N> struct TestAdd {
     }
     // std::cout << "\n";
 
-    for (float i : cpuBuf2) {
+    for (float i : cpuBuf) {
       // std::cout << cpuBuf2[i] << " ";
       assert(i == 4.0f);
     }
   }
+};
 
-  OwningOpRef<ModuleOp> parse(const char *code) {
-    std::unique_ptr<llvm::MemoryBuffer> memBuf =
-        llvm::MemoryBuffer::getMemBuffer(code);
-    llvm::SourceMgr srcMgr;
-    srcMgr.AddNewSourceBuffer(std::move(memBuf), SMLoc());
-    return parseSourceFile<ModuleOp>(srcMgr, &mlirCtx);
+// Matmul+add fixture: owns USM inputs buf0/buf1 and the output buf2.
+template <unsigned N, unsigned M = N> struct TestMatmulAdd : TestBase {
+  static constexpr unsigned size1 = N * M; // element count of buf0 and buf2
+  static constexpr unsigned size2 = M * M; // element count of buf1
+  cl_half *buf0 = gcGetOrReport(runtime.usmNewDev<cl_half>(size1));
+  cl_half *buf1 = gcGetOrReport(runtime.usmNewDev<cl_half>(size2));
+  cl_half *buf2 = gcGetOrReport(runtime.usmNewShared<cl_half>(size1));
+
+  explicit TestMatmulAdd() {
+    // One staging array feeds both copies, so it must cover the larger one.
+    constexpr unsigned maxSize = size1 > size2 ? size1 : size2;
+    cl_half cpuBuf[maxSize];
+    std::fill(cpuBuf, cpuBuf + maxSize, 2.0f);
+    assert(runtime.usmCpy(ctx, cpuBuf, buf0, size1));
+    assert(runtime.usmCpy(ctx, cpuBuf, buf1, size2));
+  }
+
+  ~TestMatmulAdd() override { // ~TestBase() runs implicitly afterwards
+    assert(runtime.usmFree(buf0));
+    assert(runtime.usmFree(buf1));
+    assert(runtime.usmFree(buf2));
+  }
+
+  void test(const char *code) {
+    OclModuleBuilder builder(parse(code));
+    auto mod = gcGetOrReport(builder.build(runtime));
+    exec(mod, ctx);
+
+    gcGetOrReport(ctx.finish());
+
+    for (unsigned i = 0; i < size1; i++) { // buf2 holds size1 elements
+      assert(buf2[i] == 4.0f);
+    }
   }
 };
 
@@ -143,6 +197,19 @@ TEST(GpuOclRuntime, TestAddStatic) {
   test2.test(addStatic);
 }
 
+TEST(GpuOclRuntime, TestMatmulAddStatic) {
+  GTEST_SKIP() << "Temporary disabled until #344 is implemented";
+  struct Test : TestMatmulAdd<64, 128> { // N = 64, M = 128
+    void exec(std::shared_ptr<const OclModule> &mod, OclContext &ctx) override {
+      assert(mod->isStatic);
+      StaticExecutor<3> exec(mod); // three buffers are passed below
+      exec(ctx, buf0, buf1, buf2);
+      assert(exec.isSmall());
+    }
+  } test;
+  test.test(matmulAddStatic);
+}
+
 TEST(GpuOclRuntime, TestAddDynamic) {
   GTEST_SKIP() << "Dynamic shapes are not yet supported";
   struct TestAddDynamic : TestAdd<32, 64> {

Original file line number	Diff line number	Diff line change
`@@ -299,10 +299,8 @@ template <unsigned N> struct OclModuleExecutorBase {`
`299`	`299`	`assert(argCounter == mod->functionType.getNumInputs());`
`300`	`300`	`}`
`301`	`301`
`302`		`- void checkArg(void *alignedPtr, bool isUsm = true) const {`
	`302`	`+ void checkArg(const void *alignedPtr, bool isUsm = true) const {`
`303`	`303`	`assert(!isUsm \|\| mod->runtime.isUsm(alignedPtr));`
`304`		`- // It's recommended to have at least 16-byte alignment`
`305`		`- assert(reinterpret_cast<std::uintptr_t>(alignedPtr) % 16 == 0);`
`306`	`304`	`}`
`307`	`305`	`#endif`
`308`	`306`	`};`