// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx950 \
// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-llvmgpu-lower-executable-target{for-rocdl=true})))))" %s | FileCheck %s

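// Aligned NHWC f16 convolution: 2x34x34x1280 input, 3x3x1280x1280 filter, lowered
// on the IGEMM path (use_igemm_convolution) with the MFMA_F32_16x16x32_F16 intrinsic.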
#pipeline_layout = #hal.pipeline.layout<bindings = [
  #hal.pipeline.binding<storage_buffer, ReadOnly>,
  #hal.pipeline.binding<storage_buffer, ReadOnly>,
  #hal.pipeline.binding<storage_buffer>
]>
#translation = #iree_codegen.translation_info<pipeline =
  LLVMGPUTileAndFuse
  workgroup_size = [256, 1, 1]
  subgroup_size = 64,
  {
    gpu_pipeline_options = #iree_gpu.pipeline_options<
      prefetch_num_stages = 0,
      no_reduce_shared_memory_bank_conflicts = false,
      use_igemm_convolution = true>
  }>
#config = #iree_gpu.lowering_config<{
  workgroup = [1, 4, 16, 256, 0],
  reduction = [0, 0, 0, 0, 2],
  subgroup = [1, 4, 1, 4, 0],
  mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F16>,
  promote_operands = [0, 1]
}>
hal.executable private @conv_nhwc_f16 {
  hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
    hal.executable.export public @conv_nhwc_f16 ordinal(0) layout(#pipeline_layout) count(%arg0: !hal.device) -> (index, index, index) {
      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
      hal.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @conv_nhwc_f16() attributes {translation_info = #translation} {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2x34x34x1280xf16>>
        %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<3x3x1280x1280xf16>>
        %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2x32x32x1280xf32>>
        %3 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 34, 34, 1280], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2x34x34x1280xf16>> -> tensor<2x34x34x1280xf16>
        %4 = iree_tensor_ext.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 1280, 1280], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<3x3x1280x1280xf16>> -> tensor<3x3x1280x1280xf16>
        %5 = tensor.empty() : tensor<2x32x32x1280xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32x32x1280xf32>) -> tensor<2x32x32x1280xf32>
        %7 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>, lowering_config = #config} ins(%3, %4 : tensor<2x34x34x1280xf16>, tensor<3x3x1280x1280xf16>) outs(%6 : tensor<2x32x32x1280xf32>) -> tensor<2x32x32x1280xf32>
        iree_tensor_ext.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 32, 1280], strides = [1, 1, 1, 1] : tensor<2x32x32x1280xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2x32x32x1280xf32>>
        return
      }
    }
  }
}

// CHECK-LABEL: func @conv_nhwc_f16
// CHECK: scf.forall
// CHECK: scf.for {{.*}} iter_args
// CHECK-DAG: vector.transfer_read {{.*}}memref<2x34x34x1280xf16, #amdgpu.address_space<fat_raw_buffer>>{{.*}}vector<8xf16>
// CHECK-DAG: vector.transfer_write {{.*}}memref<1x4x16x{{.*}}xf16, {{.*}}#gpu.address_space<workgroup>>
// CHECK-DAG: vector.transfer_read {{.*}}memref<11520x1280xf16, #amdgpu.address_space<fat_raw_buffer>>{{.*}}vector<8xf16>
// CHECK-DAG: vector.transfer_write {{.*}}memref<64x{{.*}}xf16, {{.*}}#gpu.address_space<workgroup>>
// CHECK: gpu.barrier
// CHECK: vector.transfer_read {{.*}}#gpu.address_space<workgroup>
// CHECK: amdgpu.transpose_load {{.*}}#gpu.address_space<workgroup>{{.*}}vector<4xf16>
// CHECK: amdgpu.mfma 16x16x32 {{.*}} vector<8xf16>, vector<8xf16>, vector<4xf32>
// CHECK: scf.yield

// -----

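// Unaligned strided f16 convolution: 2x35x35x1281 input, 3x3x1281x1281 filter,
// stride 2, with padded tiles (padding = [2, 1, 32, 16, 32]) on the IGEMM path.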
#pipeline_layout_unaligned = #hal.pipeline.layout<bindings = [
  #hal.pipeline.binding<storage_buffer, ReadOnly>,
  #hal.pipeline.binding<storage_buffer, ReadOnly>,
  #hal.pipeline.binding<storage_buffer>
]>
#translation_unaligned = #iree_codegen.translation_info<pipeline =
  LLVMGPUTileAndFuse
  workgroup_size = [256, 1, 1]
  subgroup_size = 64,
  {
    gpu_pipeline_options = #iree_gpu.pipeline_options<
      prefetch_num_stages = 0,
      no_reduce_shared_memory_bank_conflicts = false,
      use_igemm_convolution = true>
  }>
#config_unaligned = #iree_gpu.lowering_config<{
  padding = [2, 1, 32, 16, 32],
  workgroup = [2, 1, 32, 16, 0],
  reduction = [0, 0, 0, 0, 1],
  subgroup = [1, 1, 1, 1, 0],
  mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F16>,
  promote_operands = [0, 1]
}>
hal.executable private @conv_nhwc_unaligned_f16 {
  hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
    hal.executable.export public @conv_nhwc_unaligned_f16 ordinal(0) layout(#pipeline_layout_unaligned) count(%arg0: !hal.device) -> (index, index, index) {
      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
      hal.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @conv_nhwc_unaligned_f16() attributes {translation_info = #translation_unaligned} {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = hal.interface.binding.subspan layout(#pipeline_layout_unaligned) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
        %1 = hal.interface.binding.subspan layout(#pipeline_layout_unaligned) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<3x3x1281x1281xf16>>
        %2 = hal.interface.binding.subspan layout(#pipeline_layout_unaligned) binding(2) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
        %3 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
        %4 = iree_tensor_ext.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 1281, 1281], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<3x3x1281x1281xf16>> -> tensor<3x3x1281x1281xf16>
        %5 = tensor.empty() : tensor<2x17x17x1281xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x17x17x1281xf32>) -> tensor<2x17x17x1281xf32>
        %7 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>, lowering_config = #config_unaligned} ins(%3, %4 : tensor<2x35x35x1281xf16>, tensor<3x3x1281x1281xf16>) outs(%6 : tensor<2x17x17x1281xf32>) -> tensor<2x17x17x1281xf32>
        iree_tensor_ext.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
        return
      }
    }
  }
}

// CHECK-LABEL: func @conv_nhwc_unaligned_f16
// CHECK: scf.forall
// CHECK: scf.for {{.*}} iter_args
// CHECK-DAG: vector.transfer_read {{.*}}memref<2x35x35x1281xf16, #amdgpu.address_space<fat_raw_buffer>>
// CHECK-DAG: vector.transfer_write {{.*}}memref<2x1x32x{{.*}}xf16, {{.*}}#gpu.address_space<workgroup>>
// CHECK-DAG: vector.transfer_read {{.*}}memref<11529x1281xf16, #amdgpu.address_space<fat_raw_buffer>>
// CHECK-DAG: vector.transfer_write {{.*}}memref<32x{{.*}}xf16, {{.*}}#gpu.address_space<workgroup>>
// CHECK: gpu.barrier
// CHECK: vector.transfer_read {{.*}}#gpu.address_space<workgroup>
// CHECK: amdgpu.transpose_load {{.*}}#gpu.address_space<workgroup>{{.*}}vector<4xf16>
// CHECK: amdgpu.mfma 16x16x32 {{.*}} vector<8xf16>, vector<8xf16>, vector<4xf32>
// CHECK: scf.yield

// -----

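// bf16 contraction named @conv_input_backward_bf16: a matmul-like linalg.generic
// (16x21x384 x 384x192 -> 16x21x192) with an f32 accumulator and a bf16 truncation
// consumer, using MFMA_F32_16x16x32_BF16.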
#pipeline_layout_backward = #hal.pipeline.layout<bindings = [
  #hal.pipeline.binding<storage_buffer, "ReadOnly">,
  #hal.pipeline.binding<storage_buffer, "ReadOnly">,
  #hal.pipeline.binding<storage_buffer>
]>
#translation_backward = #iree_codegen.translation_info<pipeline =
  LLVMGPUTileAndFuse
  workgroup_size = [256, 1, 1]
  subgroup_size = 64,
  {
    gpu_pipeline_options = #iree_gpu.pipeline_options<
      prefetch_num_stages = 0,
      no_reduce_shared_memory_bank_conflicts = false,
      use_igemm_convolution = true>
  }>
#config_backward = #iree_gpu.lowering_config<{
  padding = [2, 32, 64, 64],
  workgroup = [2, 32, 64, 0],
  reduction = [0, 0, 0, 2],
  subgroup = [2, 2, 1, 0],
  mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_BF16>,
  promote_operands = [0, 1]
}>
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
#map1 = affine_map<(d0, d1, d2, d3) -> (d3, d2)>
#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
hal.executable private @conv_input_backward_bf16 {
  hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
    hal.executable.export public @conv_input_backward_bf16 ordinal(0) layout(#pipeline_layout_backward) count(%arg0: !hal.device) -> (index, index, index) {
      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
      hal.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @conv_input_backward_bf16() attributes {translation_info = #translation_backward} {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = hal.interface.binding.subspan layout(#pipeline_layout_backward) binding(0) alignment(64) offset(%c0) flags("ReadOnly") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<16x21x384xbf16>>
        %1 = hal.interface.binding.subspan layout(#pipeline_layout_backward) binding(1) alignment(64) offset(%c0) flags("ReadOnly") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<384x192xbf16>>
        %2 = hal.interface.binding.subspan layout(#pipeline_layout_backward) binding(2) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<16x21x192xbf16>>
        %3 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [16, 21, 384], strides = [1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<16x21x384xbf16>> -> tensor<16x21x384xbf16>
        %4 = iree_tensor_ext.dispatch.tensor.load %1, offsets = [0, 0], sizes = [384, 192], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<384x192xbf16>> -> tensor<384x192xbf16>
        %5 = tensor.empty() : tensor<16x21x192xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<16x21x192xf32>) -> tensor<16x21x192xf32>
        %7 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<16x21x384xbf16>, tensor<384x192xbf16>) outs(%6 : tensor<16x21x192xf32>) attrs = {lowering_config = #config_backward} {
        ^bb0(%in: bf16, %in_0: bf16, %out: f32):
          %10 = arith.extf %in : bf16 to f32
          %11 = arith.extf %in_0 : bf16 to f32
          %12 = arith.mulf %10, %11 : f32
          %13 = arith.addf %out, %12 : f32
          linalg.yield %13 : f32
        } -> tensor<16x21x192xf32>
        %8 = tensor.empty() : tensor<16x21x192xbf16>
        %9 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7 : tensor<16x21x192xf32>) outs(%8 : tensor<16x21x192xbf16>) {
        ^bb0(%in: f32, %out: bf16):
          %10 = arith.truncf %in : f32 to bf16
          linalg.yield %10 : bf16
        } -> tensor<16x21x192xbf16>
        iree_tensor_ext.dispatch.tensor.store %9, %2, offsets = [0, 0, 0], sizes = [16, 21, 192], strides = [1, 1, 1] : tensor<16x21x192xbf16> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<16x21x192xbf16>>
        return
      }
    }
  }
}

// CHECK-LABEL: func @conv_input_backward_bf16
// CHECK: scf.forall
// CHECK: scf.for {{.*}} iter_args
// CHECK-DAG: vector.transfer_read {{.*}}memref<16x21x384xbf16, #amdgpu.address_space<fat_raw_buffer>>{{.*}}vector<8xbf16>
// CHECK-DAG: vector.transfer_write {{.*}}memref<2x32x{{.*}}xbf16, {{.*}}#gpu.address_space<workgroup>>
// CHECK-DAG: vector.transfer_read {{.*}}memref<384x192xbf16, #amdgpu.address_space<fat_raw_buffer>>{{.*}}vector<8xbf16>
// CHECK-DAG: vector.transfer_write {{.*}}memref<64x{{.*}}xbf16, {{.*}}#gpu.address_space<workgroup>>
// CHECK: gpu.barrier
// CHECK: vector.transfer_read {{.*}}#gpu.address_space<workgroup>
// CHECK: amdgpu.transpose_load {{.*}}#gpu.address_space<workgroup>{{.*}}vector<4xbf16>
// CHECK: amdgpu.mfma 16x16x32 {{.*}} vector<8xbf16>, vector<8xbf16>, vector<4xf32>
// CHECK: scf.yield