 // RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-convert-to-coalesced-dma,canonicalize))" %s --split-input-file | FileCheck %s

-#gpu_target_copy = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+#gpu_target_copy = #iree_gpu.target<arch = "gfx950", features = "", wgp = <
   compute = fp32, storage = b32, subgroup = shuffle,
   max_load_instruction_bits = 128, subgroup_size_choices = [32],
   max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
@@ -49,7 +49,7 @@ func.func @copy(%source: tensor<64x512xf32>, %init: tensor<64x512xf32>) -> tenso

 // -----

-#gpu_target_gather = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+#gpu_target_gather = #iree_gpu.target<arch = "gfx950", features = "", wgp = <
   compute = fp32, storage = b32, subgroup = shuffle,
   max_load_instruction_bits = 128, subgroup_size_choices = [64],
   max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
@@ -103,7 +103,7 @@ func.func @gather(%source: tensor<64x512xf32>, %indices: tensor<64xi32>, %init:
 // Negative test: Skip coalesced DMA when innermost dimension < subgroup size. This is to ensure we do not go down
 // the slow path (which is not implemented yet).

-#gpu_target_small_inner = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+#gpu_target_small_inner = #iree_gpu.target<arch = "gfx950", features = "", wgp = <
   compute = fp32, storage = b32, subgroup = shuffle,
   max_load_instruction_bits = 128, subgroup_size_choices = [64],
   max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
@@ -176,7 +176,7 @@ func.func @copy_not_aligned_to_dma(%source_buffer: memref<320xbf16, #amdgpu.addr
 // - Instead, we should tile rows to 16 (64/4) and keep columns whole (128)
 // This ensures subviews are contiguous in memory.

-#gpu_target_contiguous = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+#gpu_target_contiguous = #iree_gpu.target<arch = "gfx950", features = "", wgp = <
   compute = fp32, storage = b32, subgroup = shuffle,
   max_load_instruction_bits = 128, subgroup_size_choices = [64],
   max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
@@ -236,7 +236,7 @@ func.func @copy_prefer_contiguous_subview(%source: tensor<64x128xf32>, %init: te
 // When output comes from tensor.empty(), we can use total elements instead of
 // innermost dimension for the size check, enabling coalesced DMA.

-#gpu_target_linearize = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+#gpu_target_linearize = #iree_gpu.target<arch = "gfx950", features = "", wgp = <
   compute = fp32, storage = b32, subgroup = shuffle,
   max_load_instruction_bits = 128, subgroup_size_choices = [64],
   max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
@@ -296,7 +296,7 @@ func.func @copy_small_innermost_linearized(%source: tensor<128x16xf32>) -> tenso
 // Test: 1D tensor copy distributes warps across the single dimension.
 // This tests the 1D tile size computation logic for flattened copies.

-#gpu_target_1d = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+#gpu_target_1d = #iree_gpu.target<arch = "gfx950", features = "", wgp = <
   compute = fp32, storage = b32, subgroup = shuffle,
   max_load_instruction_bits = 128, subgroup_size_choices = [64],
   max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
@@ -358,7 +358,7 @@ func.func @copy_1d_tensor(%source: tensor<2048xf32>) -> tensor<2048xf32>
 // 1. Innermost dim (16) < minElementsPerTransfer (64)
 // 2. Output is a function argument, not tensor.empty, so we can't linearize

-#gpu_target_no_linearize = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+#gpu_target_no_linearize = #iree_gpu.target<arch = "gfx950", features = "", wgp = <
   compute = fp32, storage = b32, subgroup = shuffle,
   max_load_instruction_bits = 128, subgroup_size_choices = [64],
   max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
@@ -395,7 +395,7 @@ func.func @copy_small_innermost_no_linearize(%source: tensor<128x16xf32>, %dest:
 // The copy should be converted to coalesced DMA when the input comes from an
 // extract_slice with contiguous innermost dimensions.

-#gpu_target_extract_input = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+#gpu_target_extract_input = #iree_gpu.target<arch = "gfx950", features = "", wgp = <
   compute = fp32, storage = b32, subgroup = shuffle,
   max_load_instruction_bits = 128, subgroup_size_choices = [64],
   max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
@@ -451,7 +451,7 @@ func.func @copy_with_extract_slice_input(%large_source: tensor<256x128xf32>) ->
 // When linalg.copy reads from tensor.pad, trace through to the original source
 // and set in_bounds attribute based on padding.

-#gpu_target_pad = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+#gpu_target_pad = #iree_gpu.target<arch = "gfx950", features = "", wgp = <
   compute = fp32, storage = b32, subgroup = shuffle,
   max_load_instruction_bits = 128, subgroup_size_choices = [64],
   max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
@@ -467,24 +467,24 @@ func.func @copy_with_extract_slice_input(%large_source: tensor<256x128xf32>) ->
 // CHECK-SAME: %[[INIT:[a-zA-Z0-9]+]]: tensor<4x64xf32>
 func.func @copy_with_tensor_pad_fusion(%source: tensor<121x64xf32>, %init: tensor<4x64xf32>, %off: index, %sz: index, %high: index) -> tensor<4x64xf32>
     attributes {hal.executable.target = #exec_target_pad, translation_info = #translation_pad} {
-  // Extract a dynamic slice
+  // Extract a dynamic slice.
   %extracted = tensor.extract_slice %source[%off, 0] [%sz, 64] [1, 1]
       : tensor<121x64xf32> to tensor<?x64xf32>

-  // Pad to static size (only M dimension has padding)
+  // Pad to static size (only M dimension has padding).
   %cst = arith.constant 0.0 : f32
   %padded = tensor.pad %extracted low[0, 0] high[%high, 0] {
   ^bb0(%arg0: index, %arg1: index):
     tensor.yield %cst : f32
   } : tensor<?x64xf32> to tensor<4x64xf32>

-  // Copy from padded tensor
+  // Copy from padded tensor.
   %result = linalg.copy {lowering_config = #iree_gpu.use_global_load_dma}
       ins(%padded : tensor<4x64xf32>)
       outs(%init : tensor<4x64xf32>) -> tensor<4x64xf32>

-  // Key check: tensor.pad is fused - source is the extract_slice result, not the padded tensor
-  // in_bounds = [false, true] because M dim has dynamic padding, K dim has no padding
+  // Key check: tensor.pad is fused - source is the extract_slice result, not the padded tensor.
+  // in_bounds = [false, true] because M dim has dynamic padding, K dim has no padding.
   // CHECK: %[[EXTRACTED:.+]] = tensor.extract_slice %[[SRC]]
   // CHECK: scf.forall {{.*}} shared_outs(%[[OUTER_INIT:.+]] = %[[INIT]])
   // CHECK: scf.forall (%[[LANE:.+]]) in (64) shared_outs(%[[INNER_INIT:.+]] = %[[OUTER_INIT]])
@@ -504,7 +504,7 @@ func.func @copy_with_tensor_pad_fusion(%source: tensor<121x64xf32>, %init: tenso
 // operates on the full padded buffer shape, not on smaller subviews.
 // This is critical for correct delinearization in the lowering pass.

-#gpu_target_pad_multi_warp = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+#gpu_target_pad_multi_warp = #iree_gpu.target<arch = "gfx950", features = "", wgp = <
   compute = fp32, storage = b32, subgroup = shuffle,
   max_load_instruction_bits = 128, subgroup_size_choices = [64],
   max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
@@ -520,18 +520,18 @@ func.func @copy_with_tensor_pad_fusion(%source: tensor<121x64xf32>, %init: tenso
 // CHECK-SAME: %[[INIT:[a-zA-Z0-9]+]]: tensor<4x64xf32>
 func.func @copy_with_tensor_pad_fusion_multi_warp(%source: tensor<121x64xf32>, %init: tensor<4x64xf32>, %off: index, %sz: index, %high: index) -> tensor<4x64xf32>
     attributes {hal.executable.target = #exec_target_pad_multi_warp, translation_info = #translation_pad_multi_warp} {
-  // Extract a dynamic slice
+  // Extract a dynamic slice.
   %extracted = tensor.extract_slice %source[%off, 0] [%sz, 64] [1, 1]
       : tensor<121x64xf32> to tensor<?x64xf32>

-  // Pad to static size (only M dimension has padding)
+  // Pad to static size (only M dimension has padding).
   %cst = arith.constant 0.0 : f32
   %padded = tensor.pad %extracted low[0, 0] high[%high, 0] {
   ^bb0(%arg0: index, %arg1: index):
     tensor.yield %cst : f32
   } : tensor<?x64xf32> to tensor<4x64xf32>

-  // Copy from padded tensor with 4 warps (256/64=4)
+  // Copy from padded tensor with 4 warps (256/64=4).
   %result = linalg.copy {lowering_config = #iree_gpu.use_global_load_dma}
       ins(%padded : tensor<4x64xf32>)
       outs(%init : tensor<4x64xf32>) -> tensor<4x64xf32>
@@ -570,7 +570,7 @@ func.func @copy_with_tensor_pad_fusion_multi_warp(%source: tensor<121x64xf32>, %
 // If a DWORD is partially out-of-bounds, the entire DWORD returns zero,
 // causing incorrect results. We bail out to avoid the slow path.

-#gpu_target_pad_unaligned = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+#gpu_target_pad_unaligned = #iree_gpu.target<arch = "gfx950", features = "", wgp = <
   compute = fp32, storage = b32, subgroup = shuffle,
   max_load_instruction_bits = 128, subgroup_size_choices = [64],
   max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,