@@ -416,73 +416,116 @@ func.func @not_vmt() {

// -----

-func.func @dynamic_parallel_dims(%dynsize : index, %input : tensor<4x?x4096xf16>) -> tensor<4x?xf32> {
-  %cst = arith.constant 0.0 : f32
-  %0 = tensor.empty(%dynsize) : tensor<4x?xf32>
-  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<4x?xf32>) -> tensor<4x?xf32>
-  %2 = linalg.generic {
-    indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>],
-    iterator_types = ["parallel", "parallel", "reduction"]}
-    ins(%input : tensor<4x?x4096xf16>) outs(%1 : tensor<4x?xf32>) {
+
+func.func @dynamic_parallel_dims_dispatch_0_reduction_Dx4096_f16xf32() {
+  %c32_i64 = arith.constant 32 : i64
+  %cst = arith.constant 0.000000e+00 : f32
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = hal.interface.constant.load layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
+  %2 = hal.interface.constant.load layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
+  %3 = hal.interface.constant.load layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
+  %4 = arith.extui %0 : i32 to i64
+  %5 = arith.extui %1 : i32 to i64
+  %6 = arith.shli %5, %c32_i64 : i64
+  %7 = arith.ori %4, %6 : i64
+  %8 = arith.index_castui %7 : i64 to index
+  %9 = arith.extui %2 : i32 to i64
+  %10 = arith.extui %3 : i32 to i64
+  %11 = arith.shli %10, %c32_i64 : i64
+  %12 = arith.ori %9, %11 : i64
+  %13 = arith.index_castui %12 : i64 to index
+  %14:2 = util.assume.int
+      %8<udiv = 4>,
+      %13<umin = 0, umax = 36028797018963964, udiv = 4>
+    : index, index
+  %15 = iree_tensor_ext.dispatch.workload.ordinal %14#0, 0 : index
+  %16 = iree_tensor_ext.dispatch.workload.ordinal %14#1, 1 : index
+  %17 = hal.interface.binding.subspan layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x4096xf16>>{%16}
+  %18 = hal.interface.binding.subspan layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?xf32>>{%15}
+  %19 = iree_tensor_ext.dispatch.tensor.load %17, offsets = [0, 0], sizes = [%16, 4096], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x4096xf16>>{%16} -> tensor<?x4096xf16>
+  %20 = tensor.empty(%15) : tensor<?xf32>
+  %21 = linalg.fill ins(%cst : f32) outs(%20 : tensor<?xf32>) -> tensor<?xf32>
+  %22 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%19 : tensor<?x4096xf16>) outs(%21 : tensor<?xf32>) {
  ^bb0(%in: f16, %out: f32):
-    %3 = arith.extf %in : f16 to f32
-    %4 = arith.addf %3, %out : f32
-    linalg.yield %4 : f32
-  } -> tensor<4x?xf32>
-  return %2 : tensor<4x?xf32>
-}
-// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 1], [0, 0, 64]{{\]}}
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = LLVMGPUWarpReduction workgroup_size = [64, 1, 1] subgroup_size = 64>
-// CHECK: func @dynamic_parallel_dims
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK: linalg.generic
-// CHECK-SAME: lowering_config = #[[CONFIG]]
-
-// CDNA3-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 1], [0, 0, 32]{{\]}}
-// CDNA3-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = LLVMGPUWarpReduction workgroup_size = [32, 1, 1] subgroup_size = 32>
-// CDNA3: func @dynamic_parallel_dims
-// CDNA3-SAME: translation_info = #[[TRANSLATION]]
-// CDNA3: linalg.generic
-// CDNA3-SAME: lowering_config = #[[CONFIG]]
+    %23 = arith.extf %in : f16 to f32
+    %24 = arith.addf %23, %out : f32
+    linalg.yield %24 : f32
+  } -> tensor<?xf32>
+  iree_tensor_ext.dispatch.tensor.store %22, %18, offsets = [0], sizes = [%15], strides = [1] : tensor<?xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?xf32>>{%15}
+  return
+}
+
+// CHECK: #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute
+// CHECK-SAME: workgroup_size = [512, 1, 1] subgroup_size = 64
+
+// CDNA: #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute
+// CDNA-SAME: workgroup_size = [512, 1, 1] subgroup_size = 64
+

// -----

-#map = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>
-#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>
-#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
-#map3 = affine_map<(d0, d1) -> (d0, d1)>
-#map4 = affine_map<(d0, d1) -> ()>
-func.func @test_dyn_reduction(%arg0: tensor<128x?x32xf8E4M3FNUZ>, %arg1: tensor<128x?x32x128xf8E4M3FNUZ>, %arg2: tensor<f32>) -> tensor<128x128xf8E4M3FNUZ> {
+func.func @test_dyn_reduction() {
+  %c32 = arith.constant 32 : index
+  %c32_i64 = arith.constant 32 : i64
  %cst = arith.constant 0.000000e+00 : f32
  %cst_0 = arith.constant -2.400000e+02 : f8E4M3FNUZ
  %cst_1 = arith.constant 2.400000e+02 : f8E4M3FNUZ
-  %0 = tensor.empty() : tensor<128x128xf8E4M3FNUZ>
-  %1 = tensor.empty() : tensor<128x128xf32>
-  %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<128x128xf32>) -> tensor<128x128xf32>
-  %3 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%arg0, %arg1 : tensor<128x?x32xf8E4M3FNUZ>, tensor<128x?x32x128xf8E4M3FNUZ>) outs(%2 : tensor<128x128xf32>) {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
+  %1 = hal.interface.constant.load layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
+  %2 = hal.interface.constant.load layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
+  %3 = hal.interface.constant.load layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
+  %4 = arith.extui %0 : i32 to i64
+  %5 = arith.extui %1 : i32 to i64
+  %6 = arith.shli %5, %c32_i64 : i64
+  %7 = arith.ori %4, %6 : i64
+  %8 = arith.index_castui %7 : i64 to index
+  %9 = arith.extui %2 : i32 to i64
+  %10 = arith.extui %3 : i32 to i64
+  %11 = arith.shli %10, %c32_i64 : i64
+  %12 = arith.ori %9, %11 : i64
+  %13 = arith.index_castui %12 : i64 to index
+  %14:2 = util.assume.int
+      %8<umin = 0, umax = 288230376151711712, udiv = 32>,
+      %13<umin = 0, umax = 288230376151711712, udiv = 32>
+    : index, index
+  %15 = hal.interface.binding.subspan layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") {iree_gpu.use_rocdl_buffer_instructions} : !iree_tensor_ext.dispatch.tensor<readonly:tensor<f32>>
+  %16 = hal.interface.binding.subspan layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) {iree_gpu.use_rocdl_buffer_instructions} : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<128x128xf8E4M3FNUZ>>
+  %17 = iree_tensor_ext.dispatch.workload.ordinal %14#0, 0 : index
+  %18 = iree_tensor_ext.dispatch.workload.ordinal %14#1, 1 : index
+  %19 = arith.divsi %17, %c32 : index
+  %20 = hal.interface.binding.subspan layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<128x?x32xf8E4M3FNUZ>>{%19}
+  %21 = arith.divsi %18, %c32 : index
+  %22 = hal.interface.binding.subspan layout(<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<128x?x32x128xf8E4M3FNUZ>>{%21}
+  %23 = iree_tensor_ext.dispatch.tensor.load %15, offsets = [], sizes = [], strides = [] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
+  %24 = tensor.empty() : tensor<128x128xf8E4M3FNUZ>
+  %25 = tensor.empty() : tensor<128x128xf32>
+  %26 = linalg.fill ins(%cst : f32) outs(%25 : tensor<128x128xf32>) -> tensor<128x128xf32>
+  %27 = iree_tensor_ext.dispatch.tensor.load %20, offsets = [0, 0, 0], sizes = [128, %19, 32], strides = [1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<128x?x32xf8E4M3FNUZ>>{%19} -> tensor<128x?x32xf8E4M3FNUZ>
+  %28 = iree_tensor_ext.dispatch.tensor.load %22, offsets = [0, 0, 0, 0], sizes = [128, %21, 32, 128], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<128x?x32x128xf8E4M3FNUZ>>{%21} -> tensor<128x?x32x128xf8E4M3FNUZ>
+  %29 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%27, %28 : tensor<128x?x32xf8E4M3FNUZ>, tensor<128x?x32x128xf8E4M3FNUZ>) outs(%26 : tensor<128x128xf32>) {
  ^bb0(%in: f8E4M3FNUZ, %in_2: f8E4M3FNUZ, %out: f32):
-    %5 = arith.extf %in : f8E4M3FNUZ to f32
-    %6 = arith.extf %in_2 : f8E4M3FNUZ to f32
-    %7 = arith.mulf %5, %6 : f32
-    %8 = arith.addf %out, %7 : f32
-    linalg.yield %8 : f32
+    %31 = arith.extf %in : f8E4M3FNUZ to f32
+    %32 = arith.extf %in_2 : f8E4M3FNUZ to f32
+    %33 = arith.mulf %31, %32 : f32
+    %34 = arith.addf %out, %33 : f32
+    linalg.yield %34 : f32
  } -> tensor<128x128xf32>
-  %4 = linalg.generic {indexing_maps = [#map3, #map4, #map3], iterator_types = ["parallel", "parallel"]} ins(%3, %arg2 : tensor<128x128xf32>, tensor<f32>) outs(%0 : tensor<128x128xf8E4M3FNUZ>) {
+  %30 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%29, %23 : tensor<128x128xf32>, tensor<f32>) outs(%24 : tensor<128x128xf8E4M3FNUZ>) {
  ^bb0(%in: f32, %in_2: f32, %out: f8E4M3FNUZ):
-    %5 = arith.truncf %in : f32 to f8E4M3FNUZ
-    %6 = arith.truncf %in_2 : f32 to f8E4M3FNUZ
-    %7 = arith.divf %5, %6 : f8E4M3FNUZ
-    %8 = arith.cmpf ult, %7, %cst_0 : f8E4M3FNUZ
-    %9 = arith.select %8, %cst_0, %7 : f8E4M3FNUZ
-    %10 = arith.cmpf ugt, %9, %cst_1 : f8E4M3FNUZ
-    %11 = arith.select %10, %cst_1, %9 : f8E4M3FNUZ
-    linalg.yield %11 : f8E4M3FNUZ
+    %31 = arith.truncf %in : f32 to f8E4M3FNUZ
+    %32 = arith.truncf %in_2 : f32 to f8E4M3FNUZ
+    %33 = arith.divf %31, %32 : f8E4M3FNUZ
+    %34 = arith.cmpf ult, %33, %cst_0 : f8E4M3FNUZ
+    %35 = arith.select %34, %cst_0, %33 : f8E4M3FNUZ
+    %36 = arith.cmpf ugt, %35, %cst_1 : f8E4M3FNUZ
+    %37 = arith.select %36, %cst_1, %35 : f8E4M3FNUZ
+    linalg.yield %37 : f8E4M3FNUZ
  } -> tensor<128x128xf8E4M3FNUZ>
-  return %4 : tensor<128x128xf8E4M3FNUZ>
+  iree_tensor_ext.dispatch.tensor.store %30, %16, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf8E4M3FNUZ> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<128x128xf8E4M3FNUZ>>
+  return
}
-// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 1], [0, 0, 1, 64]{{\]}}>
-// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = LLVMGPUWarpReduction workgroup_size = [64, 1, 1] subgroup_size = 64>
-// CHECK: func.func @test_dyn_reduction
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK: linalg.generic
-// CHECK-SAME: lowering_config = #[[$CONFIG]]
+
+// CHECK: #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute
+// CHECK-SAME: workgroup_size = [2, 1, 1] subgroup_size = 64,