@@ -1091,25 +1091,22 @@ func.func @lower_coalesced_dma_with_in_bounds(
 // - 64 lanes (one subgroup)
 // - in_bounds = [false, true]: K-dim may OOB (last tile 121 % 4 = 1), N-dim is aligned
 //
-// With 64 lanes and 4x64 dest shape:
-// - Elements per lane = 64 / 64 = 1 (each lane reads 1 f32)
+// With 64 lanes, 4x64 dest shape, and dma_sizes = [32, 128]:
+// - Elements per lane = 256 / 64 = 4 (each lane reads 4xf32 = 128 bits)
 // - Delinearization basis = (4, 64)
-// - 4 transfers per lane (one per row)
-//
-// This verifies correct row access pattern: all 4 rows (0-3) are accessed,
-// not just row 0 repeated 4 times (which was the bug before the fix).
+// - 1 transfer covers all 256 elements

 #executable_target_rocm_hsaco_fb_unaligned = #hal.executable.target<"rocm",
   "rocm-hsaco-fb", {iree_codegen.target_info = #iree_gpu.target<
-  arch = "gfx942", features = "", wgp = <
+  arch = "gfx950", features = "", wgp = <
   compute = fp32, storage = b32, subgroup = shuffle, dot = none, mma = [],
   subgroup_size_choices = [64, 64],
   max_workgroup_sizes = [1024, 1024, 1024],
   max_thread_count_per_workgroup = 1024,
   max_workgroup_memory_bytes = 65536,
   max_workgroup_counts = [2147483647, 2147483647, 2147483647],
   max_load_instruction_bits = 128, simds_per_wgp = 4,
-  vgpr_space_bits = 8192, dma_sizes = [32]>>}>
+  vgpr_space_bits = 8192, dma_sizes = [32, 128]>>}>

 #translation_64_unaligned = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>

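The updated comment compresses the key arithmetic. A quick sketch of it (illustrative Python; every number is taken from the test above) shows why adding the 128-bit entry to `dma_sizes` appears to be what lets the lowering use one wide transfer per lane:

```python
# Illustrative check of the per-lane arithmetic in the updated comment.
# All numbers come from the test: a 4x64 f32 destination, 64 lanes, and
# dma_sizes = [32, 128] (supported transfer widths in bits).

DEST_SHAPE = (4, 64)        # rows x cols of the LDS tile
LANES = 64                  # one subgroup
F32_BITS = 32
DMA_SIZES_BITS = [32, 128]  # from the target's wgp attributes

total_elements = DEST_SHAPE[0] * DEST_SHAPE[1]  # 256
elements_per_lane = total_elements // LANES     # 256 / 64 = 4
bits_per_lane = elements_per_lane * F32_BITS    # 4 x f32 = 128 bits

# The per-lane payload fits the widest supported DMA size, so a single
# gather_to_lds of vector<4xf32> per lane covers the whole tile.
assert bits_per_lane == max(DMA_SIZES_BITS)
assert elements_per_lane * LANES == total_elements  # 1 transfer total
```

With the old `dma_sizes = [32]`, each lane was capped at one f32 per transfer, hence the four per-row transfers the rest of this diff deletes.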
@@ -1124,12 +1121,12 @@ func.func @lower_coalesced_dma_4x64_tensor_pad_fusion(
     translation_info = #translation_64_unaligned} {
   // CHECK: scf.forall (%[[LANE_ID:[a-zA-Z0-9]+]]) in (64)
   scf.forall (%arg6) in (64) {
-    // Each lane reads 1 element (64 elements / 64 lanes = 1).
-    // CHECK: %[[C1:[a-zA-Z0-9_]+]] = arith.constant 1 : index
-    // CHECK: %[[LANE_OFFSET:[a-zA-Z0-9_]+]] = arith.muli %[[LANE_ID]], %[[C1]]
+    // Each lane reads 4 elements (256 elements / 64 lanes = 4).
+    // CHECK: %[[C4:[a-zA-Z0-9_]+]] = arith.constant 4 : index
+    // CHECK: %[[LANE_OFFSET:[a-zA-Z0-9_]+]] = arith.muli %[[LANE_ID]], %[[C4]]
     //
-    // 4 transfers with delinearization basis (4, 64):
-    // Transfer 1: linearOffset = 0, accesses row 0
+    // 1 transfer with delinearization basis (4, 64):
+    // Transfer 1: linearOffset = 0
     // CHECK: %[[C0:.+]] = arith.constant 0 : index
     // CHECK: %[[SRC_LIN0:.+]] = arith.addi %[[C0]], %[[LANE_OFFSET]]
     // CHECK: %[[SRC_DELIN0:.+]]:2 = affine.delinearize_index %[[SRC_LIN0]] into (4, 64)
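The CHECK lines above pin down the per-lane index computation. A small model of that math (illustrative Python mirroring the `arith.muli`, `arith.addi`, and `affine.delinearize_index` steps) confirms that the single transfer still touches every row of the 4x64 tile, which is the property the removed four-transfer checks used to establish:

```python
# Illustrative model of the lowered index math: each lane's linear offset
# is lane_id * 4 (arith.muli), added to the transfer's base offset 0
# (arith.addi), then split over the basis (4, 64) the same way
# affine.delinearize_index does.

def delinearize(linear, basis=(4, 64)):
    return divmod(linear, basis[1])  # -> (row, col)

# Lanes 0-15 land in row 0, 16-31 in row 1, ..., 48-63 in row 3, so one
# transfer covers all four rows of the tile.
rows_touched = {delinearize(0 + lane * 4)[0] for lane in range(64)}
assert rows_touched == {0, 1, 2, 3}

# A lane's 4-element read never straddles a row boundary: offsets are
# multiples of 4 and the row length 64 is divisible by 4.
assert all((lane * 4) % 64 + 4 <= 64 for lane in range(64))
```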
@@ -1138,37 +1135,7 @@ func.func @lower_coalesced_dma_4x64_tensor_pad_fusion(
     // CHECK: %[[FALSE0:.+]] = arith.constant false
     // CHECK: %[[DIM0:.+]] = memref.dim %[[SRC]], %{{.+}}
     // CHECK: %[[FIXED0:.+]] = arith.select %[[FALSE0]], %[[DIM0]], %[[SRC_DELIN0]]#0
-    // CHECK: amdgpu.gather_to_lds %[[SRC]][%[[FIXED0]], %[[SRC_DELIN0]]#1], %[[DST]][%[[DST_DELIN0]]#0, %[[DST_DELIN0]]#1] : vector<1xf32>
-    //
-    // Transfer 2: linearOffset = 64, accesses row 1
-    // CHECK: %[[C64:.+]] = arith.constant 64 : index
-    // CHECK: %[[SRC_LIN64:.+]] = arith.addi %[[C64]], %[[LANE_OFFSET]]
-    // CHECK: %[[SRC_DELIN64:.+]]:2 = affine.delinearize_index %[[SRC_LIN64]] into (4, 64)
-    // CHECK: %[[DST_DELIN64:.+]]:2 = affine.delinearize_index %[[C64]] into (4, 64)
-    // CHECK: %[[FALSE1:.+]] = arith.constant false
-    // CHECK: %[[DIM1:.+]] = memref.dim %[[SRC]], %{{.+}}
-    // CHECK: %[[FIXED1:.+]] = arith.select %[[FALSE1]], %[[DIM1]], %[[SRC_DELIN64]]#0
-    // CHECK: amdgpu.gather_to_lds %[[SRC]][%[[FIXED1]], %[[SRC_DELIN64]]#1], %[[DST]][%[[DST_DELIN64]]#0, %[[DST_DELIN64]]#1] : vector<1xf32>
-    //
-    // Transfer 3: linearOffset = 128, accesses row 2
-    // CHECK: %[[C128:.+]] = arith.constant 128 : index
-    // CHECK: %[[SRC_LIN128:.+]] = arith.addi %[[C128]], %[[LANE_OFFSET]]
-    // CHECK: %[[SRC_DELIN128:.+]]:2 = affine.delinearize_index %[[SRC_LIN128]] into (4, 64)
-    // CHECK: %[[DST_DELIN128:.+]]:2 = affine.delinearize_index %[[C128]] into (4, 64)
-    // CHECK: %[[FALSE2:.+]] = arith.constant false
-    // CHECK: %[[DIM2:.+]] = memref.dim %[[SRC]], %{{.+}}
-    // CHECK: %[[FIXED2:.+]] = arith.select %[[FALSE2]], %[[DIM2]], %[[SRC_DELIN128]]#0
-    // CHECK: amdgpu.gather_to_lds %[[SRC]][%[[FIXED2]], %[[SRC_DELIN128]]#1], %[[DST]][%[[DST_DELIN128]]#0, %[[DST_DELIN128]]#1] : vector<1xf32>
-    //
-    // Transfer 4: linearOffset = 192, accesses row 3
-    // CHECK: %[[C192:.+]] = arith.constant 192 : index
-    // CHECK: %[[SRC_LIN192:.+]] = arith.addi %[[C192]], %[[LANE_OFFSET]]
-    // CHECK: %[[SRC_DELIN192:.+]]:2 = affine.delinearize_index %[[SRC_LIN192]] into (4, 64)
-    // CHECK: %[[DST_DELIN192:.+]]:2 = affine.delinearize_index %[[C192]] into (4, 64)
-    // CHECK: %[[FALSE3:.+]] = arith.constant false
-    // CHECK: %[[DIM3:.+]] = memref.dim %[[SRC]], %{{.+}}
-    // CHECK: %[[FIXED3:.+]] = arith.select %[[FALSE3]], %[[DIM3]], %[[SRC_DELIN192]]#0
-    // CHECK: amdgpu.gather_to_lds %[[SRC]][%[[FIXED3]], %[[SRC_DELIN192]]#1], %[[DST]][%[[DST_DELIN192]]#0, %[[DST_DELIN192]]#1] : vector<1xf32>
+    // CHECK: amdgpu.gather_to_lds %[[SRC]][%[[FIXED0]], %[[SRC_DELIN0]]#1], %[[DST]][%[[DST_DELIN0]]#0, %[[DST_DELIN0]]#1] : vector<4xf32>
     // CHECK-NOT: amdgpu.gather_to_lds
     // CHECK-NOT: iree_gpu.coalesced_gather_dma
     iree_gpu.coalesced_gather_dma %source into %dest lane(%arg6) in_bounds [false, true] :
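The surviving `arith.select` is the in_bounds guard on the K-dim. A minimal sketch of its semantics, assuming the guard is meant to redirect an out-of-bounds row to the `memref.dim` value (an assumption about the lowering's intent, not stated in this diff):

```python
# Minimal sketch of the select in the CHECK lines above. arith.select
# returns its second operand when the condition is true and its third
# when false; here the condition is the constant `false`, so the
# delinearized row index flows through unchanged.

def fixed_row(guard: bool, dim0: int, delin_row: int) -> int:
    # %FIXED0 = arith.select %FALSE0, %DIM0, %SRC_DELIN0#0
    return dim0 if guard else delin_row

assert fixed_row(False, 4, 3) == 3  # folded guard: row used as-is
```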
@@ -1199,7 +1166,7 @@ func.func @lower_coalesced_dma_4x64_tensor_pad_fusion(
   max_workgroup_memory_bytes = 65536,
   max_workgroup_counts = [2147483647, 2147483647, 2147483647],
   max_load_instruction_bits = 128, simds_per_wgp = 4,
-  vgpr_space_bits = 8192, dma_sizes = [32]>>}>
+  vgpr_space_bits = 8192, dma_sizes = [32, 128]>>}>

 #translation_32_pad = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [32, 1, 1] subgroup_size = 32>
