|
| 1 | +// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx942 \ |
| 2 | +// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-configure-target-executable-variants{target=rocm})))" \ |
| 3 | +// RUN: --iree-hip-enable-tensor-ukernels \ |
| 4 | +// RUN: --verify-diagnostics %s | FileCheck %s |
| 5 | + |
| 6 | +// Checks that, with tensor ukernels enabled on gfx942, the f8 matmul-like |
| 7 | +// linalg.generic below is matched and annotated with the |
| 8 | +// "pingpong_medium_f8_expanded" tensor ukernel descriptor, that the ukernel |
| 9 | +// body is materialized as a private util.func using iree_codegen.inner_tiled, |
| 10 | +// and that the dispatch gets the TileAndFuse pipeline translation_info. |
| 11 | + |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> |
| 9 | +#map2 = affine_map<(d0, d1, d2, d3) -> (d2, d3)> |
| 10 | +#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> |
| 11 | +#pipeline_layout = #hal.pipeline.layout<bindings = [ |
| 12 | + #hal.pipeline.binding<storage_buffer>, |
| 13 | + #hal.pipeline.binding<storage_buffer>, |
| 14 | + #hal.pipeline.binding<storage_buffer> |
| 15 | +]> |
| 16 | +hal.executable public @main { |
| 17 | + hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { |
| 18 | + hal.executable.export public @matmul_f8 ordinal(0) layout(#pipeline_layout) count(%arg0: !hal.device) -> (index, index, index) { |
| 19 | + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice |
| 20 | + hal.return %x, %y, %z : index, index, index |
| 21 | + } |
| 22 | + builtin.module { |
| 23 | + func.func @matmul_f8() { |
| 24 | + %cst = arith.constant 0.000000e+00 : f32 |
| 25 | + %c0 = arith.constant 0 : index |
| 26 | + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1x128x4096xf8E4M3FNUZ>> |
| 27 | + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1024x4096xf8E4M3FNUZ>> |
| 28 | + %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<1x128x1024xf32>> |
| 29 | + %3 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 128, 4096], strides = [1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1x128x4096xf8E4M3FNUZ>> -> tensor<1x128x4096xf8E4M3FNUZ> |
| 30 | + %4 = iree_tensor_ext.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 4096], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1024x4096xf8E4M3FNUZ>> -> tensor<1024x4096xf8E4M3FNUZ> |
| 31 | + %5 = tensor.empty() : tensor<1x128x1024xf32> |
| 32 | + %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x128x1024xf32>) -> tensor<1x128x1024xf32> |
| 33 | + %7 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<1x128x4096xf8E4M3FNUZ>, tensor<1024x4096xf8E4M3FNUZ>) outs(%6 : tensor<1x128x1024xf32>) { |
| 34 | + ^bb0(%in: f8E4M3FNUZ, %in_4: f8E4M3FNUZ, %out: f32): |
| 35 | + %12 = arith.extf %in : f8E4M3FNUZ to f32 |
| 36 | + %13 = arith.extf %in_4 : f8E4M3FNUZ to f32 |
| 37 | + %14 = arith.mulf %12, %13 : f32 |
| 38 | + %15 = arith.addf %out, %14 : f32 |
| 39 | + linalg.yield %15 : f32 |
| 40 | + } -> tensor<1x128x1024xf32> |
| 41 | + iree_tensor_ext.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [1, 128, 1024], strides = [1, 1, 1] : tensor<1x128x1024xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<1x128x1024xf32>> |
| 42 | + return |
| 43 | + } |
| 44 | + } |
| 45 | + } |
| 46 | +} |
| 47 | +// CHECK: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [512, 1, 1] subgroup_size = 64 |
| 48 | +// CHECK: func.func @matmul_f8 |
| 49 | +// CHECK-SAME: translation_info = #[[TRANSLATION]] |
| 50 | +// CHECK: linalg.generic |
| 51 | +// CHECK-SAME: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8_expanded", tensor> |
| 52 | +// CHECK-SAME: lowering_config = #iree_gpu.lowering_config |
| 53 | +// CHECK: util.func private @pingpong_medium_f8_expanded |
| 54 | +// CHECK: iree_codegen.inner_tiled |
0 commit comments