From fd77066ab96a31b5a44f5c0f7ff509d6c092b622 Mon Sep 17 00:00:00 2001 From: Justin Rosner Date: Fri, 3 Oct 2025 16:01:35 +0000 Subject: [PATCH 1/2] [EXTERNAL] Fix v_mov_b16_t16 index in folding pass --- .../llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 4 ++- .../llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 26 +++++++++++++++++++ .../llvm/lib/Target/AMDGPU/SIInstrInfo.h | 1 + .../llvm/test/CodeGen/AMDGPU/true16-fold.mir | 26 +++++++++++++++++++ 4 files changed, 56 insertions(+), 1 deletion(-) diff --git a/external/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/external/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 94c5a63d802d..35a185f0aa57 100644 --- a/external/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/external/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -931,7 +931,9 @@ static MachineOperand *lookUpCopyChain(const SIInstrInfo &TII, for (MachineInstr *SubDef = MRI.getVRegDef(SrcReg); SubDef && TII.isFoldableCopy(*SubDef); SubDef = MRI.getVRegDef(Sub->getReg())) { - MachineOperand &SrcOp = SubDef->getOperand(1); + unsigned SrcIdx = TII.getFoldableCopySrcIdx(*SubDef); + MachineOperand &SrcOp = SubDef->getOperand(SrcIdx); + if (SrcOp.isImm()) return &SrcOp; if (!SrcOp.isReg() || SrcOp.getReg().isPhysical()) diff --git a/external/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/external/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 7adaedb2b655..2aa50da14341 100644 --- a/external/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/external/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3479,6 +3479,32 @@ bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) { } } +unsigned SIInstrInfo::getFoldableCopySrcIdx(const MachineInstr &MI) { + switch (MI.getOpcode()) { + case AMDGPU::V_MOV_B16_t16_e32: + case AMDGPU::V_MOV_B16_t16_e64: + return 2; + case AMDGPU::V_MOV_B32_e32: + case AMDGPU::V_MOV_B32_e64: + case AMDGPU::V_MOV_B64_PSEUDO: + case AMDGPU::V_MOV_B64_e32: + case 
AMDGPU::V_MOV_B64_e64: + case AMDGPU::S_MOV_B32: + case AMDGPU::S_MOV_B64: + case AMDGPU::S_MOV_B64_IMM_PSEUDO: + case AMDGPU::COPY: + case AMDGPU::WWM_COPY: + case AMDGPU::V_ACCVGPR_WRITE_B32_e64: + case AMDGPU::V_ACCVGPR_READ_B32_e64: + case AMDGPU::V_ACCVGPR_MOV_B32: + case AMDGPU::AV_MOV_B32_IMM_PSEUDO: + case AMDGPU::AV_MOV_B64_IMM_PSEUDO: + return 1; + default: + llvm_unreachable("MI is not a foldable copy"); + } +} + static constexpr AMDGPU::OpName ModifierOpNames[] = { AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp, diff --git a/external/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/external/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.h index bcea296c3604..969d5184a482 100644 --- a/external/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/external/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -433,6 +433,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { const MachineInstr &MIb) const override; static bool isFoldableCopy(const MachineInstr &MI); + static unsigned getFoldableCopySrcIdx(const MachineInstr &MI); void removeModOperands(MachineInstr &MI) const; diff --git a/external/llvm-project/llvm/test/CodeGen/AMDGPU/true16-fold.mir b/external/llvm-project/llvm/test/CodeGen/AMDGPU/true16-fold.mir index 25443b50cc4a..2347cf81a33b 100644 --- a/external/llvm-project/llvm/test/CodeGen/AMDGPU/true16-fold.mir +++ b/external/llvm-project/llvm/test/CodeGen/AMDGPU/true16-fold.mir @@ -59,6 +59,7 @@ body: | %4:vgpr_16 = COPY %3:sgpr_lo16 %5:vgpr_32 = V_ALIGNBIT_B32_t16_e64 0, %0:sreg_32, 0, killed %1:sreg_32, 0, killed %4:vgpr_16, 0, 0, implicit $exec S_ENDPGM 0, implicit %5 +... --- name: fold_16bit_madmix_clamp @@ -197,3 +198,28 @@ body: | $vgpr0 = COPY %4 S_ENDPGM 0, implicit $vgpr0 ... 
+ +--- +name: fold_imm16_across_reg_sequence +tracksRegLiveness: true +registers: +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + ; CHECK-LABEL: name: fold_imm16_across_reg_sequence + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[V_MOV_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64 0, -1, 0, implicit $exec + ; CHECK-NEXT: [[V_MOV_B16_t16_e64_1:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64 0, -1, 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_MOV_B16_t16_e64_]], %subreg.lo16, [[V_MOV_B16_t16_e64_1]], %subreg.hi16 + ; CHECK-NEXT: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAX_F32_e64 0, -1, 0, -1, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F32_e64_]] + ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0 + %0:vgpr_16 = V_MOV_B16_t16_e64 0, -1, 0, implicit $exec + %1:vgpr_16 = V_MOV_B16_t16_e64 0, -1, 0, implicit $exec + %2:vgpr_32 = REG_SEQUENCE %0, %subreg.lo16, %1, %subreg.hi16 + %3:vgpr_32 = nofpexcept V_MAX_F32_e64 0, %2, 0, %2, 0, 0, implicit $mode, implicit $exec + $vgpr0 = COPY %3 + S_ENDPGM 0, implicit $vgpr0 +... 
+ From c11b4e0a125ddf5ccfc7b04d7e010fe77a0ecf69 Mon Sep 17 00:00:00 2001 From: Justin Rosner Date: Fri, 3 Oct 2025 16:02:30 +0000 Subject: [PATCH 2/2] Add e2e test for failing MIGraphX CI case --- .../mixr-attention-small-decode.mlir | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 mlir/test/fusion/pr-e2e/attention/mixr-attention-small-decode.mlir diff --git a/mlir/test/fusion/pr-e2e/attention/mixr-attention-small-decode.mlir b/mlir/test/fusion/pr-e2e/attention/mixr-attention-small-decode.mlir new file mode 100644 index 000000000000..4be43056186c --- /dev/null +++ b/mlir/test/fusion/pr-e2e/attention/mixr-attention-small-decode.mlir @@ -0,0 +1,39 @@ +// Regression test for a case from the MIGraphX CI that was previously failing; +// see https://ontrack-internal.amd.com/browse/SWDEV-558297 + +// RUN: rocmlir-gen -fut mlir_attention --arch %arch --clone-harness %s | rocmlir-driver -kernel-pipeline=migraphx | rocmlir-driver -host-pipeline=migraphx,highlevel | rocmlir-gen -ph -rand 1 -rand_type float -fut mlir_attention_wrapper --verifier clone - | rocmlir-driver -host-pipeline mhal -kernel-pipeline full | xmir-runner --shared-libs=%linalg_test_lib_dir/libmlir_rocm_runtime%shlibext,%conv_validation_wrapper_library_dir/libconv-validation-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext,%linalg_test_lib_dir/libmlir_float16_utils%shlibext,%linalg_test_lib_dir/libmlir_c_runner_utils%shlibext,%linalg_test_lib_dir/libmlir_async_runtime%shlibext --entry-point-result=void | FileCheck %s + +module { + // CHECK: [1 1 1] + func.func @mlir_attention( + %arg0: !migraphx.shaped<1x1x12xf16, 12x12x1>, + %arg1: !migraphx.shaped<1x2x4x2xf16, 16x8x2x1>, + %arg2: !migraphx.shaped<1x2x4x2xf16, 16x8x2x1>, + %arg3: !migraphx.shaped<1x1xsi32, 1x1> + ) -> !migraphx.shaped<1x1x4xf16, 4x4x1> attributes {arch = "", kernel = "mixr", num_cu = 0 : i64} { + %0 = migraphx.literal(dense<[0, 1, 2, 3]> : tensor<4xsi32>) : <4xsi32, 1> + %1 = 
migraphx.literal(dense<0xFC00> : tensor<1xf16>) : <1xf16, 1> + %2 = migraphx.literal(dense<1.000000e+00> : tensor<1xf16>) : <1xf16, 1> + %3 = migraphx.reshape %arg0 {dims = [1, 1, 6, 2]} : <1x1x12xf16, 12x12x1> -> <1x1x6x2xf16, 12x12x2x1> + %4 = migraphx.transpose %3 {permutation = [0, 2, 1, 3]} : <1x1x6x2xf16, 12x12x2x1> -> <1x6x1x2xf16, 12x2x12x1> + %5 = migraphx.multibroadcast %arg3 {out_dyn_dims = [], out_lens = [1, 2]} : <1x1xsi32, 1x1> -> <1x2xsi32, 1x0> + %6 = migraphx.slice %4 {axes = [1], ends = [2], starts = [0]} : <1x6x1x2xf16, 12x2x12x1> -> <1x2x1x2xf16, 12x2x12x1> + %7 = migraphx.transpose %arg1 {permutation = [0, 1, 3, 2]} : <1x2x4x2xf16, 16x8x2x1> -> <1x2x2x4xf16, 16x8x1x2> + %8 = migraphx.dot %6, %7 : <1x2x1x2xf16, 12x2x12x1>, <1x2x2x4xf16, 16x8x1x2> -> <1x2x1x4xf16, 8x4x4x1> + %9 = migraphx.multibroadcast %0 {out_dyn_dims = [], out_lens = [1, 2, 1, 4]} : <4xsi32, 1> -> <1x2x1x4xsi32, 0x0x0x1> + %10 = migraphx.multibroadcast %1 {out_dyn_dims = [], out_lens = [1, 2, 1, 4]} : <1xf16, 1> -> <1x2x1x4xf16, 0x0x0x0> + %11 = migraphx.multibroadcast %2 {out_dyn_dims = [], out_lens = [1, 2, 1, 4]} : <1xf16, 1> -> <1x2x1x4xf16, 0x0x0x0> + %12 = migraphx.mul %8, %11 : <1x2x1x4xf16, 8x4x4x1>, <1x2x1x4xf16, 0x0x0x0> -> <1x2x1x4xf16, 8x4x4x1> + %13 = migraphx.reshape %5 {dims = [1, 2, 1, 1]} : <1x2xsi32, 1x0> -> <1x2x1x1xsi32, 2x1x1x1> + %14 = migraphx.multibroadcast %13 {out_dyn_dims = [], out_lens = [1, 2, 1, 4]} : <1x2x1x1xsi32, 2x1x1x1> -> <1x2x1x4xsi32, 2x1x1x0> + %15 = migraphx.greater %9, %14 : <1x2x1x4xsi32, 0x0x0x1>, <1x2x1x4xsi32, 2x1x1x0> -> <1x2x1x4xsi32, 8x4x4x1> + %16 = migraphx.convert %15 {target_type = 0 : i64} : <1x2x1x4xsi32, 8x4x4x1> to <1x2x1x4xsi8, 8x4x4x1> + %17 = migraphx.where %16, %10, %12 : <1x2x1x4xsi8, 8x4x4x1>, <1x2x1x4xf16, 0x0x0x0>, <1x2x1x4xf16, 8x4x4x1> -> <1x2x1x4xf16, 8x4x4x1> + %18 = migraphx.softmax %17 {axis = 3 : i64} : <1x2x1x4xf16, 8x4x4x1> -> <1x2x1x4xf16, 8x4x4x1> + %19 = migraphx.dot %18, %arg2 : <1x2x1x4xf16, 
8x4x4x1>, <1x2x4x2xf16, 16x8x2x1> -> <1x2x1x2xf16, 4x2x2x1> + %20 = migraphx.transpose %19 {permutation = [0, 2, 1, 3]} : <1x2x1x2xf16, 4x2x2x1> -> <1x1x2x2xf16, 4x2x2x1> + %21 = migraphx.reshape %20 {dims = [1, 1, 4]} : <1x1x2x2xf16, 4x2x2x1> -> <1x1x4xf16, 4x4x1> + return %21 : !migraphx.shaped<1x1x4xf16, 4x4x1> + } +} +