@@ -1164,3 +1164,63 @@ func.func @lower_coalesced_dma_4x64_tensor_pad_fusion(
11641164 } {mapping = [#gpu.thread <linear_dim_0 >]}
11651165 return
11661166}

// -----

// Test: Non-outermost dimension padding with in_bounds = [false, false].
// Source: 4x6, dest: 4x8. Dim 1 has padding (6 -> 8).
// Raw buffer OOB is linear/1D, so for non-outermost dim OOB, we must
// replace the outermost index with sourceShape[0] to force hardware OOB.
//
// Without the fix: reading at [0, 6] computes a byte offset within the
// buffer and wraps to [1, 0] instead of returning 0.
// With the fix: when srcIndices[1] >= 6, srcIndices[0] is replaced with 4
// (source dim 0 size), guaranteeing the linear offset >= buffer size, so the
// hardware returns 0.
// Executable target for gfx950 with 32-bit DMA support (dma_sizes = [32]);
// spurious extraction spaces removed so the string literals ("rocm",
// "rocm-hsaco-fb", "gfx950") and attribute syntax are valid MLIR again.
#executable_target_rocm_hsaco_fb_pad = #hal.executable.target<"rocm",
  "rocm-hsaco-fb", {iree_codegen.target_info = #iree_gpu.target<
    arch = "gfx950", features = "", wgp = <
      compute = fp32, storage = b32, subgroup = none, dot = none, mma = [], subgroup_size_choices = [32, 32],
      max_workgroup_sizes = [1024, 1024, 1024],
      max_thread_count_per_workgroup = 1024,
      max_workgroup_memory_bytes = 65536,
      max_workgroup_counts = [2147483647, 2147483647, 2147483647],
      max_load_instruction_bits = 128, simds_per_wgp = 4,
      vgpr_space_bits = 8192, dma_sizes = [32]>>}>

// Translation info: TileAndFuse pipeline, one subgroup of 32 lanes.
#translation_32_pad = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [32, 1, 1] subgroup_size = 32>
1192+
// CHECK-LABEL: func.func @gather_dma_non_outermost_oob_check
// CHECK-SAME:    %[[SRC:[a-zA-Z0-9]+]]: memref<4x6xf32, #amdgpu.address_space<fat_raw_buffer>>
// CHECK-SAME:    %[[DST:[a-zA-Z0-9]+]]: memref<4x8xf32, #gpu.address_space<workgroup>>
func.func @gather_dma_non_outermost_oob_check(
    %source: memref<4x6xf32, #amdgpu.address_space<fat_raw_buffer>>,
    %dest: memref<4x8xf32, #gpu.address_space<workgroup>>)
  attributes {
    hal.executable.target = #executable_target_rocm_hsaco_fb_pad,
    translation_info = #translation_32_pad} {
  // CHECK: scf.forall (%[[LANE_ID:[a-zA-Z0-9]+]]) in (32)
  scf.forall (%arg6) in (32) {
    // CHECK: %[[C1:[a-zA-Z0-9_]+]] = arith.constant 1
    // CHECK: %[[LANE_OFFSET:[a-zA-Z0-9_]+]] = arith.muli %[[LANE_ID]], %[[C1]]
    //
    // Transfer 1: linearOffset = 0.
    // CHECK: %[[C0:.+]] = arith.constant 0 : index
    // CHECK: %[[SRC_LIN0:.+]] = arith.addi %[[C0]], %[[LANE_OFFSET]]
    // CHECK: %[[SRC_DELIN0:.+]]:2 = affine.delinearize_index %[[SRC_LIN0]] into (4, 8)
    // CHECK: %[[DST_DELIN0:.+]]:2 = affine.delinearize_index %[[C0]] into (4, 8)
    //
    // Bounds check: compare srcIndices[1] >= 6 (source dim 1 size).
    // CHECK: %[[C6:.+]] = arith.constant 6 : index
    // CHECK: %[[OOB:.+]] = arith.cmpi uge, %[[SRC_DELIN0]]#1, %[[C6]] : index
    // Replace the outermost index with 4 (source dim 0 size) to force a
    // hardware out-of-bounds access on the linear raw-buffer offset.
    // CHECK: %[[C4_OOB:.+]] = arith.constant 4 : index
    // CHECK: %[[FIXED_IDX:.+]] = arith.select %[[OOB]], %[[C4_OOB]], %[[SRC_DELIN0]]#0 : index
    // CHECK: amdgpu.gather_to_lds %[[SRC]][%[[FIXED_IDX]], %[[SRC_DELIN0]]#1], %[[DST]][%[[DST_DELIN0]]#0, %[[DST_DELIN0]]#1] : vector<1xf32>
    // CHECK-NOT: iree_gpu.coalesced_gather_dma
    iree_gpu.coalesced_gather_dma %source into %dest lane(%arg6) in_bounds [false, false] :
        memref<4x6xf32, #amdgpu.address_space<fat_raw_buffer>>,
        memref<4x8xf32, #gpu.address_space<workgroup>>, index
  } {mapping = [#gpu.thread<linear_dim_0>]}
  return
}
0 commit comments