Commit 7d595e4
[Codegen] Update tests to be in correct state for strategy selection (#21647)

When trying to select vector distribution, there is [this code](https://github.com/iree-org/iree/blob/980d1f3638d259b4d4360c816023d1885c9b03fa/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp#L825) that relies on finding a linalg.generic by walking backwards from an `iree_tensor_ext.dispatch.tensor.store` op. If there is no such store op in the IR, vector distribution is silently skipped and warp reduction is selected instead. That is what was happening in the last 2 tests in [config_matvec.mlir](https://github.com/iree-org/iree/blame/main/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir).

I ran these 2 tests through iree-compile to see what their IR looks like just before the pass `iree-llvmgpu-select-lowering-strategy`, and replaced them with these lowered versions (which contain the `store` ops needed to correctly select the vector distribute pipeline). With this change, we see that vector distribution is indeed selected.

The 2 tests were introduced in #19381 and #20585, both of which look like warp-reduction-specific PRs, i.e. not important as warp reduction is being removed.

---------

Signed-off-by: James Newling <[email protected]>
1 parent e5c6a3f · commit 7d595e4
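For context, the IR shape the selection logic looks for is roughly the following. This is an illustrative sketch, not one of the updated tests: the function name, static shapes, and value names are invented for readability, while the op and attribute syntax mirrors the replacement tests in the diff below.

```mlir
// Hypothetical, simplified dispatch: a reduction whose result is written back
// through iree_tensor_ext.dispatch.tensor.store. The backward walk in
// KernelConfig.cpp starts at that store; if the store is absent, the
// linalg.generic is never found and vector distribution is silently skipped.
func.func @illustrative_store_anchored_reduction() {
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %in = hal.interface.binding.subspan layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4x4096xf16>>
  %out = hal.interface.binding.subspan layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<4xf32>>
  %lhs = iree_tensor_ext.dispatch.tensor.load %in, offsets = [0, 0], sizes = [4, 4096], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4x4096xf16>> -> tensor<4x4096xf16>
  %empty = tensor.empty() : tensor<4xf32>
  %fill = linalg.fill ins(%cst : f32) outs(%empty : tensor<4xf32>) -> tensor<4xf32>
  // The reduction that strategy selection wants to configure.
  %sum = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%lhs : tensor<4x4096xf16>) outs(%fill : tensor<4xf32>) {
  ^bb0(%a: f16, %acc: f32):
    %e = arith.extf %a : f16 to f32
    %r = arith.addf %e, %acc : f32
    linalg.yield %r : f32
  } -> tensor<4xf32>
  // This store is the anchor. The old tests returned the tensor from the
  // function instead, so there was no store to walk back from.
  iree_tensor_ext.dispatch.tensor.store %sum, %out, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<4xf32>>
  return
}
```

The two replacement tests in the diff below are the real, fully lowered forms of this pattern, captured from iree-compile just before `iree-llvmgpu-select-lowering-strategy`.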

File tree

1 file changed (+101 −58 lines)

compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir

Lines changed: 101 additions & 58 deletions
@@ -416,73 +416,116 @@ func.func @not_vmt() {
 
 // -----
 
-func.func @dynamic_parallel_dims(%dynsize : index, %input : tensor<4x?x4096xf16>) -> tensor<4x?xf32> {
-  %cst = arith.constant 0.0 : f32
-  %0 = tensor.empty(%dynsize) : tensor<4x?xf32>
-  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<4x?xf32>) -> tensor<4x?xf32>
-  %2 = linalg.generic {
-    indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>],
-    iterator_types = ["parallel", "parallel", "reduction"]}
-    ins(%input : tensor<4x?x4096xf16>) outs(%1 : tensor<4x?xf32>) {
+
+func.func @dynamic_parallel_dims_dispatch_0_reduction_Dx4096_f16xf32() {
+  %c32_i64 = arith.constant 32 : i64
+  %cst = arith.constant 0.000000e+00 : f32
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = hal.interface.constant.load layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
+  %2 = hal.interface.constant.load layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
+  %3 = hal.interface.constant.load layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
+  %4 = arith.extui %0 : i32 to i64
+  %5 = arith.extui %1 : i32 to i64
+  %6 = arith.shli %5, %c32_i64 : i64
+  %7 = arith.ori %4, %6 : i64
+  %8 = arith.index_castui %7 : i64 to index
+  %9 = arith.extui %2 : i32 to i64
+  %10 = arith.extui %3 : i32 to i64
+  %11 = arith.shli %10, %c32_i64 : i64
+  %12 = arith.ori %9, %11 : i64
+  %13 = arith.index_castui %12 : i64 to index
+  %14:2 = util.assume.int
+      %8<udiv = 4>,
+      %13<umin = 0, umax = 36028797018963964, udiv = 4>
+    : index, index
+  %15 = iree_tensor_ext.dispatch.workload.ordinal %14#0, 0 : index
+  %16 = iree_tensor_ext.dispatch.workload.ordinal %14#1, 1 : index
+  %17 = hal.interface.binding.subspan layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x4096xf16>>{%16}
+  %18 = hal.interface.binding.subspan layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?xf32>>{%15}
+  %19 = iree_tensor_ext.dispatch.tensor.load %17, offsets = [0, 0], sizes = [%16, 4096], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x4096xf16>>{%16} -> tensor<?x4096xf16>
+  %20 = tensor.empty(%15) : tensor<?xf32>
+  %21 = linalg.fill ins(%cst : f32) outs(%20 : tensor<?xf32>) -> tensor<?xf32>
+  %22 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%19 : tensor<?x4096xf16>) outs(%21 : tensor<?xf32>) {
   ^bb0(%in: f16, %out: f32):
-    %3 = arith.extf %in : f16 to f32
-    %4 = arith.addf %3, %out : f32
-    linalg.yield %4 : f32
-  } -> tensor<4x?xf32>
-  return %2 : tensor<4x?xf32>
-}
-// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 1], [0, 0, 64]{{\]}}
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = LLVMGPUWarpReduction workgroup_size = [64, 1, 1] subgroup_size = 64>
-// CHECK: func @dynamic_parallel_dims
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK: linalg.generic
-// CHECK-SAME: lowering_config = #[[CONFIG]]
-
-// CDNA3-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 1], [0, 0, 32]{{\]}}
-// CDNA3-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = LLVMGPUWarpReduction workgroup_size = [32, 1, 1] subgroup_size = 32>
-// CDNA3: func @dynamic_parallel_dims
-// CDNA3-SAME: translation_info = #[[TRANSLATION]]
-// CDNA3: linalg.generic
-// CDNA3-SAME: lowering_config = #[[CONFIG]]
+    %23 = arith.extf %in : f16 to f32
+    %24 = arith.addf %23, %out : f32
+    linalg.yield %24 : f32
+  } -> tensor<?xf32>
+  iree_tensor_ext.dispatch.tensor.store %22, %18, offsets = [0], sizes = [%15], strides = [1] : tensor<?xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?xf32>>{%15}
+  return
+}
+
+// CHECK: #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute
+// CHECK-SAME: workgroup_size = [512, 1, 1] subgroup_size = 64
+
+// CDNA: #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute
+// CDNA-SAME: workgroup_size = [512, 1, 1] subgroup_size = 64
+
 
 // -----
 
-#map = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>
-#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>
-#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
-#map3 = affine_map<(d0, d1) -> (d0, d1)>
-#map4 = affine_map<(d0, d1) -> ()>
-func.func @test_dyn_reduction(%arg0: tensor<128x?x32xf8E4M3FNUZ>, %arg1: tensor<128x?x32x128xf8E4M3FNUZ>, %arg2: tensor<f32>) -> tensor<128x128xf8E4M3FNUZ> {
+func.func @test_dyn_reduction() {
+  %c32 = arith.constant 32 : index
+  %c32_i64 = arith.constant 32 : i64
   %cst = arith.constant 0.000000e+00 : f32
   %cst_0 = arith.constant -2.400000e+02 : f8E4M3FNUZ
   %cst_1 = arith.constant 2.400000e+02 : f8E4M3FNUZ
-  %0 = tensor.empty() : tensor<128x128xf8E4M3FNUZ>
-  %1 = tensor.empty() : tensor<128x128xf32>
-  %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<128x128xf32>) -> tensor<128x128xf32>
-  %3 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%arg0, %arg1 : tensor<128x?x32xf8E4M3FNUZ>, tensor<128x?x32x128xf8E4M3FNUZ>) outs(%2 : tensor<128x128xf32>) {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = hal.interface.constant.load layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
+  %2 = hal.interface.constant.load layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
+  %3 = hal.interface.constant.load layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
+  %4 = arith.extui %0 : i32 to i64
+  %5 = arith.extui %1 : i32 to i64
+  %6 = arith.shli %5, %c32_i64 : i64
+  %7 = arith.ori %4, %6 : i64
+  %8 = arith.index_castui %7 : i64 to index
+  %9 = arith.extui %2 : i32 to i64
+  %10 = arith.extui %3 : i32 to i64
+  %11 = arith.shli %10, %c32_i64 : i64
+  %12 = arith.ori %9, %11 : i64
+  %13 = arith.index_castui %12 : i64 to index
+  %14:2 = util.assume.int
+      %8<umin = 0, umax = 288230376151711712, udiv = 32>,
+      %13<umin = 0, umax = 288230376151711712, udiv = 32>
+    : index, index
+  %15 = hal.interface.binding.subspan layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") {iree_gpu.use_rocdl_buffer_instructions} : !iree_tensor_ext.dispatch.tensor<readonly:tensor<f32>>
+  %16 = hal.interface.binding.subspan layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) {iree_gpu.use_rocdl_buffer_instructions} : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<128x128xf8E4M3FNUZ>>
+  %17 = iree_tensor_ext.dispatch.workload.ordinal %14#0, 0 : index
+  %18 = iree_tensor_ext.dispatch.workload.ordinal %14#1, 1 : index
+  %19 = arith.divsi %17, %c32 : index
+  %20 = hal.interface.binding.subspan layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<128x?x32xf8E4M3FNUZ>>{%19}
+  %21 = arith.divsi %18, %c32 : index
+  %22 = hal.interface.binding.subspan layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<128x?x32x128xf8E4M3FNUZ>>{%21}
+  %23 = iree_tensor_ext.dispatch.tensor.load %15, offsets = [], sizes = [], strides = [] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
+  %24 = tensor.empty() : tensor<128x128xf8E4M3FNUZ>
+  %25 = tensor.empty() : tensor<128x128xf32>
+  %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<128x128xf32>) -> tensor<128x128xf32>
+  %27 = iree_tensor_ext.dispatch.tensor.load %20, offsets = [0, 0, 0], sizes = [128, %19, 32], strides = [1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<128x?x32xf8E4M3FNUZ>>{%19} -> tensor<128x?x32xf8E4M3FNUZ>
+  %28 = iree_tensor_ext.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [128, %21, 32, 128], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<128x?x32x128xf8E4M3FNUZ>>{%21} -> tensor<128x?x32x128xf8E4M3FNUZ>
+  %29 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%27, %28 : tensor<128x?x32xf8E4M3FNUZ>, tensor<128x?x32x128xf8E4M3FNUZ>) outs(%26 : tensor<128x128xf32>) {
   ^bb0(%in: f8E4M3FNUZ, %in_2: f8E4M3FNUZ, %out: f32):
-    %5 = arith.extf %in : f8E4M3FNUZ to f32
-    %6 = arith.extf %in_2 : f8E4M3FNUZ to f32
-    %7 = arith.mulf %5, %6 : f32
-    %8 = arith.addf %out, %7 : f32
-    linalg.yield %8 : f32
+    %31 = arith.extf %in : f8E4M3FNUZ to f32
+    %32 = arith.extf %in_2 : f8E4M3FNUZ to f32
+    %33 = arith.mulf %31, %32 : f32
+    %34 = arith.addf %out, %33 : f32
+    linalg.yield %34 : f32
   } -> tensor<128x128xf32>
-  %4 = linalg.generic {indexing_maps = [#map3, #map4, #map3], iterator_types = ["parallel", "parallel"]} ins(%3, %arg2 : tensor<128x128xf32>, tensor<f32>) outs(%0 : tensor<128x128xf8E4M3FNUZ>) {
+  %30 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%29, %23 : tensor<128x128xf32>, tensor<f32>) outs(%24 : tensor<128x128xf8E4M3FNUZ>) {
   ^bb0(%in: f32, %in_2: f32, %out: f8E4M3FNUZ):
-    %5 = arith.truncf %in : f32 to f8E4M3FNUZ
-    %6 = arith.truncf %in_2 : f32 to f8E4M3FNUZ
-    %7 = arith.divf %5, %6 : f8E4M3FNUZ
-    %8 = arith.cmpf ult, %7, %cst_0 : f8E4M3FNUZ
-    %9 = arith.select %8, %cst_0, %7 : f8E4M3FNUZ
-    %10 = arith.cmpf ugt, %9, %cst_1 : f8E4M3FNUZ
-    %11 = arith.select %10, %cst_1, %9 : f8E4M3FNUZ
-    linalg.yield %11 : f8E4M3FNUZ
+    %31 = arith.truncf %in : f32 to f8E4M3FNUZ
+    %32 = arith.truncf %in_2 : f32 to f8E4M3FNUZ
+    %33 = arith.divf %31, %32 : f8E4M3FNUZ
+    %34 = arith.cmpf ult, %33, %cst_0 : f8E4M3FNUZ
+    %35 = arith.select %34, %cst_0, %33 : f8E4M3FNUZ
+    %36 = arith.cmpf ugt, %35, %cst_1 : f8E4M3FNUZ
+    %37 = arith.select %36, %cst_1, %35 : f8E4M3FNUZ
+    linalg.yield %37 : f8E4M3FNUZ
   } -> tensor<128x128xf8E4M3FNUZ>
-  return %4 : tensor<128x128xf8E4M3FNUZ>
+  iree_tensor_ext.dispatch.tensor.store %30, %16, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf8E4M3FNUZ> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<128x128xf8E4M3FNUZ>>
+  return
 }
-// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 1], [0, 0, 1, 64]{{\]}}>
-// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = LLVMGPUWarpReduction workgroup_size = [64, 1, 1] subgroup_size = 64>
-// CHECK: func.func @test_dyn_reduction
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK: linalg.generic
-// CHECK-SAME: lowering_config = #[[$CONFIG]]
+
+// CHECK: #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute
+// CHECK-SAME: workgroup_size = [2, 1, 1] subgroup_size = 64,
