// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/gpu-to-llvm.pp \
// RUN: --runner imex-cpu-runner -e main \
// RUN: --entry-point-result=void \
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%levelzero_runtime --filecheck
// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/gpu-to-llvm.pp \
// RUN: --runner imex-cpu-runner -e main \
// RUN: --entry-point-result=void \
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck

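// Element-wise addition of two 10x20 bf16 buffers on a GPU. Both inputs are
// filled with 0.5, so every element of the result is expected to be 1.0.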
module @eltwise_add attributes {gpu.container_module} {
  memref.global "private" constant @__constant_10x20xbf16 : memref<10x20xbf16> = dense<5.000000e-01>
  func.func @test(%arg0: memref<10x20xbf16>, %arg1: memref<10x20xbf16>) -> memref<10x20xbf16> {
    %c20 = arith.constant 20 : index
    %c10 = arith.constant 10 : index
    %c1 = arith.constant 1 : index
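    // host_shared allocations are accessible from both host and device, so the
    // inputs can be staged with plain memref.copy instead of explicit transfers.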
    %memref = gpu.alloc host_shared () : memref<10x20xbf16>
    memref.copy %arg1, %memref : memref<10x20xbf16> to memref<10x20xbf16>
    %memref_0 = gpu.alloc host_shared () : memref<10x20xbf16>
    memref.copy %arg0, %memref_0 : memref<10x20xbf16> to memref<10x20xbf16>
    %memref_1 = gpu.alloc host_shared () : memref<10x20xbf16>
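    // Launch one workgroup per element: a 10x20x1 grid with a single thread each.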
    gpu.launch_func @test_kernel::@test_kernel blocks in (%c10, %c20, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<10x20xbf16>, %memref : memref<10x20xbf16>, %memref_1 : memref<10x20xbf16>)
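    // Copy the result into host memory so the device buffers can be freed before returning.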
    %alloc = memref.alloc() : memref<10x20xbf16>
    memref.copy %memref_1, %alloc : memref<10x20xbf16> to memref<10x20xbf16>
    gpu.dealloc %memref_1 : memref<10x20xbf16>
    gpu.dealloc %memref_0 : memref<10x20xbf16>
    gpu.dealloc %memref : memref<10x20xbf16>
    return %alloc : memref<10x20xbf16>
  }
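  // The kernel targets a SPIR-V environment that advertises bf16 support
  // (BFloat16TypeKHR plus Intel's bf16 conversion extension) for the bf16 addf below.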
  gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Float16Buffer, Int64, Int16, Int8, Bfloat16ConversionINTEL, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR, VectorAnyINTEL, BFloat16TypeKHR], [SPV_INTEL_bfloat16_conversion, SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume, SPV_INTEL_vector_compute, SPV_KHR_bfloat16]>, api=OpenCL, #spirv.resource_limits<>>} {
    gpu.func @test_kernel(%arg0: memref<10x20xbf16>, %arg1: memref<10x20xbf16>, %arg2: memref<10x20xbf16>) kernel attributes {VectorComputeFunctionINTEL, gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 10, 20, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
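      // Each workgroup computes exactly one element; the block ids select the row and column.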
      %block_id_x = gpu.block_id x
      %block_id_y = gpu.block_id y
      %0 = memref.load %arg0[%block_id_x, %block_id_y] : memref<10x20xbf16>
      %1 = memref.load %arg1[%block_id_x, %block_id_y] : memref<10x20xbf16>
      %2 = arith.addf %0, %1 : bf16
      memref.store %2, %arg2[%block_id_x, %block_id_y] : memref<10x20xbf16>
      gpu.return
    }
  }
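  // Entry point: both operands alias the same 0.5-filled global, so every sum is 1.0.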
  func.func @main() {
    %0 = memref.get_global @__constant_10x20xbf16 : memref<10x20xbf16>
    %1 = memref.get_global @__constant_10x20xbf16 : memref<10x20xbf16>
    %2 = call @test(%0, %1) : (memref<10x20xbf16>, memref<10x20xbf16>) -> memref<10x20xbf16>
    %cast = memref.cast %2 : memref<10x20xbf16> to memref<*xbf16>
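    // FileCheck expects all 10x20 = 200 printed elements to equal 1.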
    // CHECK: Unranked Memref base@ = {{(0x)?[0-9a-f]*}}
    // CHECK-COUNT-200: 1
    call @printMemrefBF16(%cast) : (memref<*xbf16>) -> ()
    return
  }
  func.func private @printMemrefBF16(memref<*xbf16>) attributes {llvm.emit_c_interface}
}