// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/gpu-to-llvm.pp \
// RUN: --runner imex-cpu-runner -e main \
// RUN: --entry-point-result=void \
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%levelzero_runtime --filecheck
// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/gpu-to-llvm.pp \
// RUN: --runner imex-cpu-runner -e main \
// RUN: --entry-point-result=void \
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck

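// Element-wise addition of two 10x20 bf16 buffers on a GPU. Both inputs are
// filled with 0.5, so every element of the result is expected to be 1.0.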
module @eltwise_add attributes {gpu.container_module} {
  memref.global "private" constant @__constant_10x20xbf16 : memref<10x20xbf16> = dense<5.000000e-01>
  func.func @test(%arg0: memref<10x20xbf16>, %arg1: memref<10x20xbf16>) -> memref<10x20xbf16> {
    %c20 = arith.constant 20 : index
    %c10 = arith.constant 10 : index
    %c1 = arith.constant 1 : index
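    // host_shared allocations are accessible from both host and device, so the
    // inputs can be staged with plain memref.copy instead of explicit transfers.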
    %memref = gpu.alloc host_shared () : memref<10x20xbf16>
    memref.copy %arg1, %memref : memref<10x20xbf16> to memref<10x20xbf16>
    %memref_0 = gpu.alloc host_shared () : memref<10x20xbf16>
    memref.copy %arg0, %memref_0 : memref<10x20xbf16> to memref<10x20xbf16>
    %memref_1 = gpu.alloc host_shared () : memref<10x20xbf16>
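    // Launch one workgroup per element: a 10x20x1 grid with a single thread each.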
    gpu.launch_func @test_kernel::@test_kernel blocks in (%c10, %c20, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<10x20xbf16>, %memref : memref<10x20xbf16>, %memref_1 : memref<10x20xbf16>)
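    // Copy the result into host memory so the device buffers can be freed before returning.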
    %alloc = memref.alloc() : memref<10x20xbf16>
    memref.copy %memref_1, %alloc : memref<10x20xbf16> to memref<10x20xbf16>
    gpu.dealloc %memref_1 : memref<10x20xbf16>
    gpu.dealloc %memref_0 : memref<10x20xbf16>
    gpu.dealloc %memref : memref<10x20xbf16>
    return %alloc : memref<10x20xbf16>
  }
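  // The kernel targets a SPIR-V environment that advertises bf16 support
  // (BFloat16TypeKHR plus Intel's bf16 conversion extension) for the bf16 addf below.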
  gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Float16Buffer, Int64, Int16, Int8, Bfloat16ConversionINTEL, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR, VectorAnyINTEL, BFloat16TypeKHR], [SPV_INTEL_bfloat16_conversion, SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume, SPV_INTEL_vector_compute, SPV_KHR_bfloat16]>, api=OpenCL, #spirv.resource_limits<>>} {
    gpu.func @test_kernel(%arg0: memref<10x20xbf16>, %arg1: memref<10x20xbf16>, %arg2: memref<10x20xbf16>) kernel attributes {VectorComputeFunctionINTEL, gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 10, 20, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
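      // Each workgroup computes exactly one element; the block ids select the row and column.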
      %block_id_x = gpu.block_id x
      %block_id_y = gpu.block_id y
      %0 = memref.load %arg0[%block_id_x, %block_id_y] : memref<10x20xbf16>
      %1 = memref.load %arg1[%block_id_x, %block_id_y] : memref<10x20xbf16>
      %2 = arith.addf %0, %1 : bf16
      memref.store %2, %arg2[%block_id_x, %block_id_y] : memref<10x20xbf16>
      gpu.return
    }
  }
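  // Entry point: both operands alias the same 0.5-filled global, so every sum is 1.0.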
  func.func @main() {
    %0 = memref.get_global @__constant_10x20xbf16 : memref<10x20xbf16>
    %1 = memref.get_global @__constant_10x20xbf16 : memref<10x20xbf16>
    %2 = call @test(%0, %1) : (memref<10x20xbf16>, memref<10x20xbf16>) -> memref<10x20xbf16>
    %cast = memref.cast %2 : memref<10x20xbf16> to memref<*xbf16>
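    // FileCheck expects all 10x20 = 200 printed elements to equal 1.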
    // CHECK: Unranked Memref base@ = {{(0x)?[0-9a-f]*}}
    // CHECK-COUNT-200: 1
    call @printMemrefBF16(%cast) : (memref<*xbf16>) -> ()
    return
  }
  func.func private @printMemrefBF16(memref<*xbf16>) attributes {llvm.emit_c_interface}
}