@@ -526,3 +526,53 @@ func.func @copy_with_tensor_pad_fusion_multi_warp(%source: tensor<121x64xf32>, %
 
   return %result : tensor<4x64xf32>
 }
+
+// -----
+
+// Test: tensor.pad fusion bails out when the source row size is not DWORD-aligned.
+// On AMD CDNA, buffer loads perform per-component range checking for each DWORD.
+// If a DWORD is partially out-of-bounds, the entire DWORD returns zero,
+// producing incorrect results, so we bail out of the DMA rewrite instead.
+
+#gpu_target_pad_unaligned = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+    compute = fp32, storage = b32, subgroup = shuffle,
+    max_load_instruction_bits = 128, subgroup_size_choices = [64],
+    max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
+    max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647],
+    dma_sizes = [32, 128]
+>>
+
+#exec_target_pad_unaligned = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree_codegen.target_info = #gpu_target_pad_unaligned}>
+#translation_pad_unaligned = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_num_stages = 2, no_reduce_shared_memory_bank_conflicts = true, use_igemm_convolution = false>}>
+
+// CHECK-LABEL: func.func @copy_with_tensor_pad_unaligned_row
+// CHECK-SAME:    %[[SRC:[a-zA-Z0-9]+]]: tensor<65x121xf16>
+// CHECK-SAME:    %[[INIT:[a-zA-Z0-9]+]]: tensor<4x124xf16>
+func.func @copy_with_tensor_pad_unaligned_row(%source: tensor<65x121xf16>, %init: tensor<4x124xf16>, %off: index, %sz: index, %high_m: index) -> tensor<4x124xf16>
+    attributes {hal.executable.target = #exec_target_pad_unaligned, translation_info = #translation_pad_unaligned} {
+  // Extract a dynamic slice: tensor<?x121xf16>.
+  // Row size = 121 * 2 bytes = 242 bytes, NOT 4-byte aligned.
+  %extracted = tensor.extract_slice %source[%off, 0] [%sz, 121] [1, 1]
+      : tensor<65x121xf16> to tensor<?x121xf16>
+
+  // Pad to a static size.
+  %cst = arith.constant 0.0 : f16
+  %padded = tensor.pad %extracted low[0, 0] high[%high_m, 3] {
+  ^bb0(%arg0: index, %arg1: index):
+    tensor.yield %cst : f16
+  } : tensor<?x121xf16> to tensor<4x124xf16>
+
+  // Copy from the padded tensor.
+  %result = linalg.copy {lowering_config = #iree_gpu.use_global_load_dma}
+      ins(%padded : tensor<4x124xf16>)
+      outs(%init : tensor<4x124xf16>) -> tensor<4x124xf16>
+
+  // Source row size (121 * 2 = 242 bytes) is not DWORD-aligned.
+  // Coalesced DMA bails out to avoid partial OOB in per-DWORD range checking.
+  // The linalg.copy should remain unchanged.
+  // CHECK:     tensor.pad
+  // CHECK:     linalg.copy
+  // CHECK-NOT: iree_gpu.coalesced_gather_dma
+
+  return %result : tensor<4x124xf16>
+}
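
For reference, the bail-out condition this test exercises reduces to a row-size alignment check: the coalesced-DMA rewrite is only safe when every source row spans whole DWORDs. The standalone C++ sketch below illustrates that arithmetic; it is not the actual IREE pass code, and the helper name and constant are illustrative assumptions.

```cpp
#include <cstdint>
#include <iostream>

// Illustrative sketch, not the IREE pass implementation: CDNA buffer loads
// range-check per DWORD and zero out a DWORD that is only partially in
// bounds, so the DMA rewrite must require each source row to be a whole
// number of DWORDs.
constexpr int64_t kDwordBytes = 4;

bool rowIsDwordAligned(int64_t rowElements, int64_t elementBytes) {
  return (rowElements * elementBytes) % kDwordBytes == 0;
}

int main() {
  // This test: 121 f16 elements -> 242 bytes, not DWORD-aligned -> bail out.
  std::cout << rowIsDwordAligned(121, 2) << "\n";  // prints 0
  // The earlier f32 test: 64 f32 elements -> 256 bytes, aligned -> can fuse.
  std::cout << rowIsDwordAligned(64, 4) << "\n";   // prints 1
  return 0;
}
```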