// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-convert-to-coalesced-dma,canonicalize))" %s --split-input-file | FileCheck %s

-#gpu_target_copy = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+#gpu_target_copy = #iree_gpu.target<arch = "gfx950", features = "", wgp = <
  compute = fp32, storage = b32, subgroup = shuffle,
  max_load_instruction_bits = 128, subgroup_size_choices = [32],
  max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
@@ -49,7 +49,7 @@ func.func @copy(%source: tensor<64x512xf32>, %init: tensor<64x512xf32>) -> tenso

// -----

-#gpu_target_gather = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+#gpu_target_gather = #iree_gpu.target<arch = "gfx950", features = "", wgp = <
  compute = fp32, storage = b32, subgroup = shuffle,
  max_load_instruction_bits = 128, subgroup_size_choices = [64],
  max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
@@ -103,7 +103,7 @@ func.func @gather(%source: tensor<64x512xf32>, %indices: tensor<64xi32>, %init:
// Test: Skip coalesced DMA when innermost dimension < subgroup size. This is to ensure we do not go down
// the slow path (which is not implemented yet).

-#gpu_target_small_inner = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+#gpu_target_small_inner = #iree_gpu.target<arch = "gfx950", features = "", wgp = <
  compute = fp32, storage = b32, subgroup = shuffle,
  max_load_instruction_bits = 128, subgroup_size_choices = [64],
  max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
@@ -140,7 +140,7 @@ func.func @copy_small_innermost_dim(%source: tensor<64x32xf32>, %init: tensor<64
// - Instead, we should tile rows to 16 (64/4) and keep columns whole (128)
// This ensures subviews are contiguous in memory.

-#gpu_target_contiguous = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+#gpu_target_contiguous = #iree_gpu.target<arch = "gfx950", features = "", wgp = <
  compute = fp32, storage = b32, subgroup = shuffle,
  max_load_instruction_bits = 128, subgroup_size_choices = [64],
  max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
@@ -200,7 +200,7 @@ func.func @copy_prefer_contiguous_subview(%source: tensor<64x128xf32>, %init: te
// When output comes from tensor.empty(), we can use total elements instead of
// innermost dimension for the size check, enabling coalesced DMA.

-#gpu_target_linearize = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+#gpu_target_linearize = #iree_gpu.target<arch = "gfx950", features = "", wgp = <
  compute = fp32, storage = b32, subgroup = shuffle,
  max_load_instruction_bits = 128, subgroup_size_choices = [64],
  max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
@@ -260,7 +260,7 @@ func.func @copy_small_innermost_linearized(%source: tensor<128x16xf32>) -> tenso
// Test: 1D tensor copy distributes warps across the single dimension.
// This tests the 1D tile size computation logic for flattened copies.

-#gpu_target_1d = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+#gpu_target_1d = #iree_gpu.target<arch = "gfx950", features = "", wgp = <
  compute = fp32, storage = b32, subgroup = shuffle,
  max_load_instruction_bits = 128, subgroup_size_choices = [64],
  max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
@@ -322,7 +322,7 @@ func.func @copy_1d_tensor(%source: tensor<2048xf32>) -> tensor<2048xf32>
// 1. Innermost dim (16) < minElementsPerTransfer (64)
// 2. Output is a function argument, not tensor.empty, so we can't linearize

-#gpu_target_no_linearize = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+#gpu_target_no_linearize = #iree_gpu.target<arch = "gfx950", features = "", wgp = <
  compute = fp32, storage = b32, subgroup = shuffle,
  max_load_instruction_bits = 128, subgroup_size_choices = [64],
  max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
@@ -359,7 +359,7 @@ func.func @copy_small_innermost_no_linearize(%source: tensor<128x16xf32>, %dest:
// The copy should be converted to coalesced DMA when the input comes from an
// extract_slice with contiguous innermost dimensions.

-#gpu_target_extract_input = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+#gpu_target_extract_input = #iree_gpu.target<arch = "gfx950", features = "", wgp = <
  compute = fp32, storage = b32, subgroup = shuffle,
  max_load_instruction_bits = 128, subgroup_size_choices = [64],
  max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
@@ -415,7 +415,7 @@ func.func @copy_with_extract_slice_input(%large_source: tensor<256x128xf32>) ->
// When linalg.copy reads from tensor.pad, trace through to the original source
// and set in_bounds attribute based on padding.

-#gpu_target_pad = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+#gpu_target_pad = #iree_gpu.target<arch = "gfx950", features = "", wgp = <
  compute = fp32, storage = b32, subgroup = shuffle,
  max_load_instruction_bits = 128, subgroup_size_choices = [64],
  max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
@@ -431,24 +431,24 @@ func.func @copy_with_extract_slice_input(%large_source: tensor<256x128xf32>) ->
// CHECK-SAME: %[[INIT:[a-zA-Z0-9]+]]: tensor<4x64xf32>
func.func @copy_with_tensor_pad_fusion(%source: tensor<121x64xf32>, %init: tensor<4x64xf32>, %off: index, %sz: index, %high: index) -> tensor<4x64xf32>
    attributes {hal.executable.target = #exec_target_pad, translation_info = #translation_pad} {
-  // Extract a dynamic slice
+  // Extract a dynamic slice.
  %extracted = tensor.extract_slice %source[%off, 0] [%sz, 64] [1, 1]
      : tensor<121x64xf32> to tensor<?x64xf32>

-  // Pad to static size (only M dimension has padding)
+  // Pad to static size (only M dimension has padding).
  %cst = arith.constant 0.0 : f32
  %padded = tensor.pad %extracted low[0, 0] high[%high, 0] {
  ^bb0(%arg0: index, %arg1: index):
    tensor.yield %cst : f32
  } : tensor<?x64xf32> to tensor<4x64xf32>

-  // Copy from padded tensor
+  // Copy from padded tensor.
  %result = linalg.copy {lowering_config = #iree_gpu.use_global_load_dma}
      ins(%padded : tensor<4x64xf32>)
      outs(%init : tensor<4x64xf32>) -> tensor<4x64xf32>

-  // Key check: tensor.pad is fused - source is the extract_slice result, not the padded tensor
-  // in_bounds = [false, true] because M dim has dynamic padding, K dim has no padding
+  // Key check: tensor.pad is fused - source is the extract_slice result, not the padded tensor.
+  // in_bounds = [false, true] because M dim has dynamic padding, K dim has no padding.
  // CHECK: %[[EXTRACTED:.+]] = tensor.extract_slice %[[SRC]]
  // CHECK: scf.forall {{.*}} shared_outs(%[[OUTER_INIT:.+]] = %[[INIT]])
  // CHECK: scf.forall (%[[LANE:.+]]) in (64) shared_outs(%[[INNER_INIT:.+]] = %[[OUTER_INIT]])
@@ -468,7 +468,7 @@ func.func @copy_with_tensor_pad_fusion(%source: tensor<121x64xf32>, %init: tenso
// operates on the full padded buffer shape, not on smaller subviews.
// This is critical for correct delinearization in the lowering pass.

-#gpu_target_pad_multi_warp = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+#gpu_target_pad_multi_warp = #iree_gpu.target<arch = "gfx950", features = "", wgp = <
  compute = fp32, storage = b32, subgroup = shuffle,
  max_load_instruction_bits = 128, subgroup_size_choices = [64],
  max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
@@ -484,18 +484,18 @@ func.func @copy_with_tensor_pad_fusion(%source: tensor<121x64xf32>, %init: tenso
// CHECK-SAME: %[[INIT:[a-zA-Z0-9]+]]: tensor<4x64xf32>
func.func @copy_with_tensor_pad_fusion_multi_warp(%source: tensor<121x64xf32>, %init: tensor<4x64xf32>, %off: index, %sz: index, %high: index) -> tensor<4x64xf32>
    attributes {hal.executable.target = #exec_target_pad_multi_warp, translation_info = #translation_pad_multi_warp} {
-  // Extract a dynamic slice
+  // Extract a dynamic slice.
  %extracted = tensor.extract_slice %source[%off, 0] [%sz, 64] [1, 1]
      : tensor<121x64xf32> to tensor<?x64xf32>

-  // Pad to static size (only M dimension has padding)
+  // Pad to static size (only M dimension has padding).
  %cst = arith.constant 0.0 : f32
  %padded = tensor.pad %extracted low[0, 0] high[%high, 0] {
  ^bb0(%arg0: index, %arg1: index):
    tensor.yield %cst : f32
  } : tensor<?x64xf32> to tensor<4x64xf32>

-  // Copy from padded tensor with 4 warps (256/64=4)
+  // Copy from padded tensor with 4 warps (256/64=4).
  %result = linalg.copy {lowering_config = #iree_gpu.use_global_load_dma}
      ins(%padded : tensor<4x64xf32>)
      outs(%init : tensor<4x64xf32>) -> tensor<4x64xf32>
@@ -534,7 +534,7 @@ func.func @copy_with_tensor_pad_fusion_multi_warp(%source: tensor<121x64xf32>, %
// If a DWORD is partially out-of-bounds, the entire DWORD returns zero,
// causing incorrect results. We bail out to avoid the slow path.

-#gpu_target_pad_unaligned = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+#gpu_target_pad_unaligned = #iree_gpu.target<arch = "gfx950", features = "", wgp = <
  compute = fp32, storage = b32, subgroup = shuffle,
  max_load_instruction_bits = 128, subgroup_size_choices = [64],
  max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,