From 0e3554fc153d50e489d45c33aaee26a017c358bf Mon Sep 17 00:00:00 2001 From: Fabian Stuckmann Date: Tue, 13 May 2025 05:56:18 -0600 Subject: [PATCH 1/6] [AIEX] unit-tests Global Combiners --- .../global-combiners/MBB-crossing.mir | 207 ++++++++++++++++++ .../GlobalIsel/global-combiners/gemm.mir | 81 +++++++ .../mixture-offset-postinc-selection.mir | 48 ++++ .../global-combiners/movability-check.mir | 50 +++++ .../global-combiners/overlap-gain.mir | 78 +++++++ .../global-combiners/post-inc-eagerness.mir | 75 +++++++ .../global-combiners/reorder-Mem-Instrs.mir | 45 ++++ .../shared-postinc-constants.mir | 58 +++++ .../global-combiners/user-intrinsics.mir | 51 +++++ 9 files changed, 693 insertions(+) create mode 100644 llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/MBB-crossing.mir create mode 100644 llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/gemm.mir create mode 100644 llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/mixture-offset-postinc-selection.mir create mode 100644 llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/movability-check.mir create mode 100644 llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/overlap-gain.mir create mode 100644 llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/post-inc-eagerness.mir create mode 100644 llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/reorder-Mem-Instrs.mir create mode 100644 llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/shared-postinc-constants.mir create mode 100644 llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/user-intrinsics.mir diff --git a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/MBB-crossing.mir b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/MBB-crossing.mir new file mode 100644 index 000000000000..2b7726405ab9 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/MBB-crossing.mir @@ -0,0 +1,207 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# +# This file 
is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates + +# RUN: llc --mtriple=aie2p -verify-machineinstrs \ +# RUN: --start-before=aie2p-postlegalizer-generic-combiner \ +# RUN: --stop-after=aie2p-postlegalizer-custom-combiner \ +# RUN: -o - %s | FileCheck %s + +# Consider MBB Crossing in Selection Decision + +# Check out of block use of %0(p0) in BB.1 +--- +name: MBB-crossing +legalized: true +alignment: 16 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: MBB-crossing + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 192 + ; CHECK-NEXT: G_STORE [[C1]](s32), [[COPY]](p0) :: (store (s32)) + ; CHECK-NEXT: G_AIE_OFFSET_STORE [[C1]](s32), [[COPY]](p0), [[C]](s20) :: (store (s32)) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: G_STORE [[C1]](s32), [[COPY]](p0) :: (store (s32)) + ; CHECK-NEXT: PseudoRET implicit $lr + bb.0: + liveins: $p0 + %0:_(p0) = COPY $p0 + %1:_(s20) = G_CONSTANT i20 64 + %2:_(s32) = G_CONSTANT i32 192 + %3:_(p0) = G_PTR_ADD %0, %1(s20) + G_STORE %2(s32), %0(p0) :: (store (s32)) + G_STORE %2(s32), %3(p0) :: (store (s32)) + bb.1: + G_STORE %2, %0(p0) :: (store (s32)) + PseudoRET implicit $lr +... 
+ +# %3(p0) is defined in bb.0, outside the MBB (bb.1) that uses it +--- +name: def-outside-loop-dominance +legalized: true +alignment: 16 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: def-outside-loop-dominance + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %6(s32), %bb.1, [[C2]](s32), %bb.0 + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = nsw G_ADD [[PHI]], [[C3]] + ; CHECK-NEXT: G_STORE [[ADD]](s32), [[COPY]](p0) :: (store (s32)) + ; CHECK-NEXT: G_AIE_OFFSET_STORE [[ADD]](s32), [[COPY]](p0), [[C]](s20) :: (store (s32)) + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(sgt), [[ADD]](s32), [[C1]] + ; CHECK-NEXT: G_BRCOND [[ICMP]](s32), %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: PseudoRET implicit $lr + bb.0: + liveins: $p0 + %0:_(p0) = COPY $p0 + %1:_(s20) = G_CONSTANT i20 64 + %2:_(s32) = G_CONSTANT i32 0 + %3:_(p0) = G_PTR_ADD %0, %1(s20) + %4:_(s32) = G_CONSTANT i32 4 + bb.1: + %5:_(s32) = G_PHI %7(s32), %bb.1, %4(s32), %bb.0 + %6:_(s32) = G_CONSTANT i32 1 + %7:_(s32) = nsw G_ADD %5, %6 + G_STORE %7(s32), %0(p0) :: (store (s32)) + G_STORE %7(s32), %3(p0) :: (store (s32)) + %8:_(s32) = G_ICMP intpred(sgt), %7(s32), %2 + G_BRCOND %8(s32), %bb.1 + bb.2: + PseudoRET implicit $lr +... 
+ +# %0 is used to define a G_PTR_ADD (%10) in a different block (bb.2) than where +# %0 is otherwise used (bb.1) +--- +name: ptr-add-after-loop +legalized: true +alignment: 16 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: ptr-add-after-loop + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 32 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s20) = G_CONSTANT i20 16 + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %6(s32), %bb.1, [[C3]](s32), %bb.0 + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = nsw G_ADD [[PHI]], [[C4]] + ; CHECK-NEXT: G_STORE [[ADD]](s32), [[COPY]](p0) :: (store (s32)) + ; CHECK-NEXT: G_AIE_OFFSET_STORE [[ADD]](s32), [[COPY]](p0), [[C2]](s20) :: (store (s32)) + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(sgt), [[ADD]](s32), [[C1]] + ; CHECK-NEXT: G_BRCOND [[ICMP]](s32), %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: G_AIE_OFFSET_STORE [[ADD]](s32), [[COPY]](p0), [[C]](s20) :: (store (s32)) + ; CHECK-NEXT: PseudoRET implicit $lr + bb.0: + liveins: $p0 + %0:_(p0) = COPY $p0 + %1:_(s20) = G_CONSTANT i20 32 + %2:_(s32) = G_CONSTANT i32 0 + %3:_(s20) = G_CONSTANT i20 16 + %4:_(s32) = G_CONSTANT i32 4 + bb.1: + %5:_(s32) = G_PHI %7(s32), %bb.1, %4(s32), %bb.0 + %6:_(s32) = G_CONSTANT i32 1 + %7:_(s32) = nsw G_ADD %5, %6 + G_STORE %7(s32), %0(p0) :: (store (s32)) + %8:_(p0) = G_PTR_ADD %0, %3(s20) + G_STORE %7(s32), %8(p0) :: (store (s32)) + %9:_(s32) = G_ICMP intpred(sgt), %7(s32), %2 + G_BRCOND %9(s32), %bb.1 + bb.2: + %10:_(p0) = G_PTR_ADD %0, %1(s20) + G_STORE %7(s32), 
%10(p0) :: (store (s32)) + PseudoRET implicit $lr +... + +# The result of G_PTR_ADD (%l1) is used in a phi of the same MBB. +--- +name: phi-ptr-add +legalized: true +alignment: 16 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: phi-ptr-add + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s20) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %6(s32), %bb.1, [[C2]](s32), %bb.0 + ; CHECK-NEXT: %l0:_(p0) = G_PHI %l1(p0), %bb.1, [[PTR_ADD]](p0), %bb.0 + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = nsw G_ADD [[PHI]], [[C3]] + ; CHECK-NEXT: G_STORE [[ADD]](s32), [[COPY]](p0) :: (store (s32)) + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s20) = G_CONSTANT i20 1 + ; CHECK-NEXT: %l1:_(p0) = G_AIE_POSTINC_STORE [[ADD]](s32), %l0, [[C4]](s20) :: (store (s32)) + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(sgt), [[ADD]](s32), [[C1]] + ; CHECK-NEXT: G_BRCOND [[ICMP]](s32), %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: PseudoRET implicit $lr + bb.0: + liveins: $p0 + %0:_(p0) = COPY $p0 + %1:_(s20) = G_CONSTANT i20 64 + %2:_(s32) = G_CONSTANT i32 0 + %3:_(p0) = G_PTR_ADD %0, %1(s20) + %4:_(s32) = G_CONSTANT i32 4 + bb.1: + %5:_(s32) = G_PHI %7(s32), %bb.1, %4(s32), %bb.0 + %l0:_(p0) = G_PHI %l1(p0), %bb.1, %3(p0), %bb.0 + %6:_(s32) = G_CONSTANT i32 1 + %7:_(s32) = nsw G_ADD %5, %6 + G_STORE %7(s32), %0(p0) :: (store (s32)) + G_STORE %7(s32), %l0(p0) :: (store (s32)) + %8:_(s20) = G_CONSTANT i20 1 + %l1:_(p0) = G_PTR_ADD %l0, 
%8 + %9:_(s32) = G_ICMP intpred(sgt), %7(s32), %2 + G_BRCOND %9(s32), %bb.1 + bb.2: + PseudoRET implicit $lr +... diff --git a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/gemm.mir b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/gemm.mir new file mode 100644 index 000000000000..0c7b51150ee5 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/gemm.mir @@ -0,0 +1,81 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates + +# RUN: llc --mtriple=aie2p -verify-machineinstrs \ +# RUN: --start-before=aie2p-postlegalizer-generic-combiner \ +# RUN: --stop-after=aie2p-postlegalizer-custom-combiner \ +# RUN: -o - %s | FileCheck %s + +# Verify that Gemm has the correct assignment + + +--- +name: gemm-kernel +legalized: true +alignment: 16 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: gemm-kernel + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $p0, $p1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CHECK-NEXT: %configZero:_(s20) = G_LOAD [[COPY]](p0) :: (load (s20), align 4) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI [[C3]](s32), %bb.0, %7(s32), %bb.1 + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:_(p0) = G_PHI [[COPY]](p0), %bb.0, %9(p0), %bb.1 + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI]], [[C2]] + ; 
CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(sgt), [[ADD]](s32), [[C]] + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p0), [[INT1:%[0-9]+]]:_(s20), [[INT2:%[0-9]+]]:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2p.add.3d), [[PHI1]](p0), [[C1]](s20), [[C1]](s20), [[C1]](s20), [[C1]](s20), [[C1]](s20), [[C1]](s20), [[C1]](s20) + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 + ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(<32 x s16>), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[PHI1]], %configZero(s20) :: (load (<32 x s16>)) + ; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:_(<32 x s16>) = G_AIE_OFFSET_LOAD [[PHI1]](p0), [[C4]](s20) :: (load (<32 x s16>)) + ; CHECK-NEXT: [[AIE_POSTINC_LOAD2:%[0-9]+]]:_(<32 x s16>), [[AIE_POSTINC_LOAD3:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[AIE_POSTINC_LOAD1]], [[C4]](s20) :: (load (<32 x s16>)) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<32 x s16>) = G_LOAD [[AIE_POSTINC_LOAD3]](p0) :: (load (<32 x s16>)) + ; CHECK-NEXT: G_BRCOND [[ICMP]](s32), %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[AIE_POSTINC_LOAD]](<32 x s16>), implicit [[AIE_OFFSET_LOAD]](<32 x s16>), implicit [[AIE_POSTINC_LOAD2]](<32 x s16>), implicit [[LOAD]](<32 x s16>) + bb.0: + liveins: $p0, $p1 + %0:_(p0) = COPY $p0 + %1:_(s32) = G_CONSTANT i32 0 + %2:_(s20) = G_CONSTANT i20 64 + %3:_(s32) = G_CONSTANT i32 -1 + %4:_(s32) = G_CONSTANT i32 4 + %configZero:_(s20) = G_LOAD %0(p0) :: (load (s20)) + bb.1: + %5:_(s32) = G_PHI %4, %bb.0, %7, %bb.1 + %6:_(p0) = G_PHI %0, %bb.0, %9, %bb.1 + %7:_(s32) = G_ADD %3, %5 + %8:_(s32) = G_ICMP intpred(sgt), %7(s32), %1 + %9:_(p0), %10:_(s20), %11:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2p.add.3d), %6:_(p0), %2:_(s20), %2:_(s20), %2:_(s20), %2:_(s20), %2:_(s20), %2:_(s20), %2:_(s20) + %12:_(<32 x s16>) = G_LOAD %6(p0) :: (load (<32 x s16>)) + %13:_(s20) = G_CONSTANT i20 64 + %14:_(p0) = nuw G_PTR_ADD %6, %13(s20) + %15:_(<32 x s16>) = G_LOAD %14(p0) :: (load (<32 x s16>)) + 
%16:_(p0) = G_PTR_ADD %6, %configZero + %17:_(<32 x s16>) = G_LOAD %16(p0) :: (load (<32 x s16>)) + %18:_(p0) = nuw G_PTR_ADD %16, %13(s20) + %19:_(<32 x s16>) = G_LOAD %18(p0) :: (load (<32 x s16>)) + G_BRCOND %8(s32), %bb.1 + bb.2: + PseudoRET implicit $lr, implicit %12, implicit %15, implicit %17, implicit %19 +... +... + + + diff --git a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/mixture-offset-postinc-selection.mir b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/mixture-offset-postinc-selection.mir new file mode 100644 index 000000000000..ab1b8d05bc0c --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/mixture-offset-postinc-selection.mir @@ -0,0 +1,48 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates + +# RUN: llc --mtriple=aie2p -verify-machineinstrs \ +# RUN: --start-before=aie2p-postlegalizer-generic-combiner \ +# RUN: --stop-after=aie2p-postlegalizer-custom-combiner \ +# RUN: --aie-address-chaining=true \ +# RUN: -o - %s | FileCheck %s + +# store hinders addr-chaining and thus leads to suboptimal assignments. 
+# The store should have been a postinc, so that the second load could be an +# Offset Load within Imm range + + +--- +name: select-offset-and-postinc +legalized: true +alignment: 16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $p0, $p1 + ; CHECK-LABEL: name: select-offset-and-postinc + ; CHECK: liveins: $p0, $p1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 16 + ; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:_(s32) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C]](s20) :: (load (s32)) + ; CHECK-NEXT: G_AIE_OFFSET_STORE [[AIE_OFFSET_LOAD]](s32), [[COPY]](p0), [[C]](s20) :: (store (s32)) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 32 + ; CHECK-NEXT: [[AIE_OFFSET_LOAD1:%[0-9]+]]:_(s32) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C1]](s20) :: (load (s32)) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[AIE_OFFSET_LOAD]](s32), implicit [[AIE_OFFSET_LOAD1]](s32) + %0:_(p0) = COPY $p0 + %1:_(p0) = COPY $p1 + %2:_(s20) = G_CONSTANT i20 16 + %3:_(p0) = nuw G_PTR_ADD %0, %2 + %4:_(s32) = G_LOAD %3(p0) :: (load (s32)) + G_STORE %4, %3(p0) :: (store (s32)) + %5:_(s20) = G_CONSTANT i20 16 + %6:_(p0) = nuw G_PTR_ADD %3, %5 + %7:_(s32) = G_LOAD %6(p0) :: (load (s32)) + PseudoRET implicit $lr, implicit %4, implicit %7 +... diff --git a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/movability-check.mir b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/movability-check.mir new file mode 100644 index 000000000000..18dbf15931bc --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/movability-check.mir @@ -0,0 +1,50 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025 Advanced Micro Devices, Inc. 
or its affiliates + +# RUN: llc --mtriple=aie2p -verify-machineinstrs \ +# RUN: --start-before=aie2p-postlegalizer-generic-combiner \ +# RUN: --stop-after=aie2p-postlegalizer-custom-combiner \ +# RUN: -o - %s | FileCheck %s + +# Check Movability + +# Check that PTR_ADD which has a lower DAG depth than the CombineRoot (load), +# can be combined +--- +name: movability-check +legalized: true +alignment: 16 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: movability-check + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $p0, $p1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 4 + ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[C]](s20) :: (load (s32)) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[AIE_POSTINC_LOAD]](s32), implicit [[AIE_POSTINC_LOAD1]](p0) + bb.0: + liveins: $p0, $p1 + %0:_(p0) = COPY $p0 + %1:_(s32) = G_CONSTANT i32 0 + %2:_(s20) = G_CONSTANT i20 64 + %3:_(s32) = G_CONSTANT i32 -1 + bb.1: + %5:_(s32) = G_LOAD %0(p0) :: (load (s32)) + %6:_(s20) = G_CONSTANT i20 4 + %7:_(p0) = G_PTR_ADD %0(p0), %6(s20) + PseudoRET implicit $lr, implicit %5, implicit %7 +... +... + + + diff --git a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/overlap-gain.mir b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/overlap-gain.mir new file mode 100644 index 000000000000..1d3be84328d8 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/overlap-gain.mir @@ -0,0 +1,78 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025 Advanced Micro Devices, Inc. 
or its affiliates + +# RUN: llc --mtriple=aie2p -verify-machineinstrs \ +# RUN: --start-before=aie2p-postlegalizer-generic-combiner \ +# RUN: --stop-after=aie2p-postlegalizer-custom-combiner \ +# RUN: --aie-address-chaining=false \ +# RUN: -o - %s | FileCheck %s + +# Check that there is no conflict between pointer independent post-increments +# and a following pre-increment + +--- +name: check-non-overlap-gain +legalized: true +alignment: 16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $p0 + ; CHECK-LABEL: name: check-non-overlap-gain + ; CHECK: liveins: $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 192 + ; CHECK-NEXT: [[AIE_POSTINC_3D_STORE:%[0-9]+]]:_(p0), [[AIE_POSTINC_3D_STORE1:%[0-9]+]]:_(s20), [[AIE_POSTINC_3D_STORE2:%[0-9]+]]:_ = G_AIE_POSTINC_3D_STORE [[C1]](s32), [[COPY]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (store (s32)) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s20) = G_CONSTANT i20 8 + ; CHECK-NEXT: G_AIE_OFFSET_STORE [[C1]](s32), [[AIE_POSTINC_3D_STORE]](p0), [[C2]](s20) :: (store (s32)) + ; CHECK-NEXT: PseudoRET implicit $lr + %0:_(p0) = COPY $p0 + %1:_(s20) = G_CONSTANT i20 64 + %2:_(s32) = G_CONSTANT i32 192 + G_STORE %2(s32), %0(p0) :: (store (s32)) + %3:_(p0), %4:_(s20), %5:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2p.add.3d), %0:_(p0), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20) + %6:_(s20) = G_CONSTANT i20 8 + %7:_(p0) = G_PTR_ADD %3, %6(s20) + G_STORE %2, %7(p0) :: (store (s32)) + PseudoRET implicit $lr +... 
+ + +# Assign 3d post-increment to later store, so that the pointer does not have to +# be saved +--- +name: overlap-penalty +legalized: true +alignment: 16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $p0 + ; CHECK-LABEL: name: overlap-penalty + ; CHECK: liveins: $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 192 + ; CHECK-NEXT: G_STORE [[C1]](s32), [[COPY]](p0) :: (store (s32)) + ; CHECK-NEXT: [[AIE_POSTINC_3D_STORE:%[0-9]+]]:_(p0), [[AIE_POSTINC_3D_STORE1:%[0-9]+]]:_(s20), [[AIE_POSTINC_3D_STORE2:%[0-9]+]]:_ = G_AIE_POSTINC_3D_STORE [[C1]](s32), [[COPY]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (store (s32)) + ; CHECK-NEXT: G_STORE [[C1]](s32), [[AIE_POSTINC_3D_STORE]](p0) :: (store (s32)) + ; CHECK-NEXT: PseudoRET implicit $lr + %0:_(p0) = COPY $p0 + %1:_(s20) = G_CONSTANT i20 64 + %2:_(s32) = G_CONSTANT i32 192 + %3:_(p0), %4:_(s20), %5:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2p.add.3d), %0:_(p0), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20) + G_STORE %2(s32), %0(p0) :: (store (s32)) + G_STORE %2, %0(p0) :: (store (s32)) + G_STORE %2, %3(p0) :: (store (s32)) + PseudoRET implicit $lr +... + + diff --git a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/post-inc-eagerness.mir b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/post-inc-eagerness.mir new file mode 100644 index 000000000000..803e19fa2e0a --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/post-inc-eagerness.mir @@ -0,0 +1,75 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025 Advanced Micro Devices, Inc. 
or its affiliates + +# RUN: llc --mtriple=aie2p -verify-machineinstrs \ +# RUN: --start-before=aie2p-postlegalizer-generic-combiner \ +# RUN: --stop-after=aie2p-postlegalizer-custom-combiner \ +# RUN: --aie-address-chaining=false \ +# RUN: -o - %s | FileCheck %s + +# The post-increment with the first memory instruction causes a pointer copy, +# increasing register pressure. +# FIXME: reorder instructions + +--- +name: post-inc-reg-pressure-store +legalized: true +alignment: 16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $p0 + ; CHECK-LABEL: name: post-inc-reg-pressure-store + ; CHECK: liveins: $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 192 + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s20) = G_CONSTANT i20 8 + ; CHECK-NEXT: [[AIE_POSTINC_3D_STORE:%[0-9]+]]:_(p0), [[AIE_POSTINC_3D_STORE1:%[0-9]+]]:_(s20), [[AIE_POSTINC_3D_STORE2:%[0-9]+]]:_ = G_AIE_POSTINC_3D_STORE [[C1]](s32), [[COPY]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (store (s32)) + ; CHECK-NEXT: G_AIE_OFFSET_STORE [[C1]](s32), [[COPY]](p0), [[C2]](s20) :: (store (s32)) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[AIE_POSTINC_3D_STORE]](p0) + %0:_(p0) = COPY $p0 + %1:_(s20) = G_CONSTANT i20 64 + %2:_(s32) = G_CONSTANT i32 192 + G_STORE %2(s32), %0(p0) :: (store (s32)) + %6:_(s20) = G_CONSTANT i20 8 + %7:_(p0) = G_PTR_ADD %0, %6(s20) + G_STORE %2(s32), %7(p0) :: (store (s32)) + %3:_(p0), %4:_(s20), %5:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2p.add.3d), %0:_(p0), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20) + PseudoRET implicit $lr, implicit %3 +... 
+ +# similar example as above, but with load instructions +--- +name: post-inc-reg-pressure-load +legalized: true +alignment: 16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $p0 + ; CHECK-LABEL: name: post-inc-reg-pressure-load + ; CHECK: liveins: $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 8 + ; CHECK-NEXT: %lZero:_(s32), %7:_(p0), %8:_(s20), %9:_ = G_AIE_POSTINC_3D_LOAD [[COPY]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (load (s32)) + ; CHECK-NEXT: %lOne:_(s32) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C1]](s20) :: (load (s32)) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit %7(p0), implicit %lZero(s32), implicit %lOne(s32) + %0:_(p0) = COPY $p0 + %1:_(s20) = G_CONSTANT i20 64 + %2:_(s32) = G_CONSTANT i32 192 + %lZero:_(s32) = G_LOAD %0(p0) :: (load (s32)) + %6:_(s20) = G_CONSTANT i20 8 + %7:_(p0) = G_PTR_ADD %0, %6(s20) + %lOne:_(s32) = G_LOAD %7(p0) :: (load (s32)) + %3:_(p0), %4:_(s20), %5:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2p.add.3d), %0:_(p0), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20) + PseudoRET implicit $lr, implicit %3, implicit %lZero, implicit %lOne +... diff --git a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/reorder-Mem-Instrs.mir b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/reorder-Mem-Instrs.mir new file mode 100644 index 000000000000..e34bbaa37880 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/reorder-Mem-Instrs.mir @@ -0,0 +1,45 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025 Advanced Micro Devices, Inc. 
or its affiliates + +# RUN: llc --mtriple=aie2p -verify-machineinstrs \ +# RUN: --start-before=aie2p-postlegalizer-generic-combiner \ +# RUN: --stop-after=aie2p-postlegalizer-custom-combiner \ +# RUN: --aie-address-chaining=false \ +# RUN: -o - %s | FileCheck %s + +# Reorder Loads, so that the pointer does not have to be restored for the +# second Load Instr +--- +name: reorder-loads +legalized: true +alignment: 16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $p0 + ; CHECK-LABEL: name: reorder-loads + ; CHECK: liveins: $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 + ; CHECK-NEXT: %sZero:_(s32), %6:_(p0), %7:_(s20), %8:_ = G_AIE_POSTINC_3D_LOAD [[COPY]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (load (s32)) + ; CHECK-NEXT: %sOne:_(s32) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C]](s20) :: (load (s32)) + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = nsw G_ADD %sZero, %sOne + ; CHECK-NEXT: G_STORE [[ADD]](s32), %6(p0) :: (store (s32)) + ; CHECK-NEXT: PseudoRET implicit $lr + %0:_(p0) = COPY $p0 + %1:_(s20) = G_CONSTANT i20 64 + %2:_(s32) = G_CONSTANT i32 192 + %sZero:_(s32) = G_LOAD %0(p0) :: (load (s32)) + %3:_(p0) = G_PTR_ADD %0, %1(s20) + %sOne:_(s32) = G_LOAD %3(p0) :: (load (s32)) + %4:_(p0), %5:_(s20), %6:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2p.add.3d), %0:_(p0), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20), %1:_(s20) + %7:_(s32) = nsw G_ADD %sZero(s32), %sOne(s32) + G_STORE %7, %4(p0) :: (store (s32)) + PseudoRET implicit $lr +... 
diff --git a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/shared-postinc-constants.mir b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/shared-postinc-constants.mir new file mode 100644 index 000000000000..49a25a9947cd --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/shared-postinc-constants.mir @@ -0,0 +1,58 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates + +# RUN: llc --mtriple=aie2p -verify-machineinstrs \ +# RUN: --start-before=aie2p-postlegalizer-generic-combiner \ +# RUN: --stop-after=aie2p-postlegalizer-custom-combiner \ +# RUN: --aie-address-chaining=true -o - %s | FileCheck %s + +# overlap gain: Do not penalize Immediates that can be reused, if the Immediates +# have to be copied to registers because they do not fit the bit encoding length. +# Thus select Post-Increments, since they only have a single copy penalty. 
+ +--- +name: shared-postinc-constants +legalized: true +alignment: 16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $p0, $p1, $p2 + ; CHECK-LABEL: name: shared-postinc-constants + ; CHECK: liveins: $p0, $p1, $p2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 16 + ; CHECK-NEXT: [[AIE_POSTINC_ZEXTLOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_ZEXTLOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_ZEXTLOAD [[COPY]], [[C]](s20) :: (load (s8)) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 16 + ; CHECK-NEXT: [[AIE_POSTINC_ZEXTLOAD2:%[0-9]+]]:_(s32), [[AIE_POSTINC_ZEXTLOAD3:%[0-9]+]]:_(p0) = G_AIE_POSTINC_ZEXTLOAD [[AIE_POSTINC_ZEXTLOAD1]], [[C1]](s20) :: (load (s8)) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s20) = G_CONSTANT i20 16 + ; CHECK-NEXT: [[AIE_POSTINC_ZEXTLOAD4:%[0-9]+]]:_(s32), [[AIE_POSTINC_ZEXTLOAD5:%[0-9]+]]:_(p0) = G_AIE_POSTINC_ZEXTLOAD [[AIE_POSTINC_ZEXTLOAD3]], [[C2]](s20) :: (load (s8)) + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s20) = G_CONSTANT i20 16 + ; CHECK-NEXT: [[AIE_POSTINC_ZEXTLOAD6:%[0-9]+]]:_(s32), [[AIE_POSTINC_ZEXTLOAD7:%[0-9]+]]:_(p0) = G_AIE_POSTINC_ZEXTLOAD [[AIE_POSTINC_ZEXTLOAD5]], [[C3]](s20) :: (load (s8)) + ; CHECK-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[AIE_POSTINC_ZEXTLOAD7]](p0) :: (load (s8)) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[AIE_POSTINC_ZEXTLOAD]](s32), implicit [[AIE_POSTINC_ZEXTLOAD2]](s32), implicit [[AIE_POSTINC_ZEXTLOAD4]](s32), implicit [[AIE_POSTINC_ZEXTLOAD6]](s32), implicit [[ZEXTLOAD]](s32) + %0:_(p0) = COPY $p0 + %2:_(p0) = COPY $p2 + %3:_(s20) = G_CONSTANT i20 16 + %4:_(p0) = nuw G_PTR_ADD %0, %3(s20) + %5:_(s32) = G_ZEXTLOAD %0(p0) :: (load (s8)) + %6:_(s32) = G_ZEXTLOAD %4(p0) :: (load (s8)) + %7:_(s20) = G_CONSTANT i20 44 + %22:_(s20) = G_CONSTANT i20 16 + %10:_(p0) = nuw G_PTR_ADD %4, %22(s20) + %23:_(s20) = G_CONSTANT i20 16 + %12:_(p0) = nuw G_PTR_ADD %10, %23(s20) + %13:_(s32) = G_ZEXTLOAD %10(p0) :: (load (s8)) + %14:_(s32) = 
G_ZEXTLOAD %12(p0) :: (load (s8)) + %24:_(s20) = G_CONSTANT i20 16 + %16:_(p0) = nuw G_PTR_ADD %12, %24(s20) + %17:_(s32) = G_ZEXTLOAD %16(p0) :: (load (s8)) + PseudoRET implicit $lr, implicit %5, implicit %6, implicit %13, implicit %14, implicit %17 +... + diff --git a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/user-intrinsics.mir b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/user-intrinsics.mir new file mode 100644 index 000000000000..2455684ee8a6 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/user-intrinsics.mir @@ -0,0 +1,51 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates + +# RUN: llc --mtriple=aie2p -verify-machineinstrs \ +# RUN: --start-before=aie2p-postlegalizer-generic-combiner \ +# RUN: --stop-after=aie2p-postlegalizer-custom-combiner \ +# RUN: --aie-address-chaining=false \ +# RUN: -o - %s | FileCheck %s + +# properly assign user intrinsics + +--- +name: assign-user-intrinsics +legalized: true +alignment: 16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $p0, $p1 + ; CHECK-LABEL: name: assign-user-intrinsics + ; CHECK: liveins: $p0, $p1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p1 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 8 + ; CHECK-NEXT: [[AIE_POSTINC_3D_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_3D_LOAD1:%[0-9]+]]:_(p0), [[AIE_POSTINC_3D_LOAD2:%[0-9]+]]:_(s20), [[AIE_POSTINC_3D_LOAD3:%[0-9]+]]:_ = G_AIE_POSTINC_3D_LOAD [[COPY]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (load (s32)) + ; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:_(s32) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C]](s20) :: (load (s32)) + ; 
CHECK-NEXT: [[AIE_POSTINC_3D_LOAD4:%[0-9]+]]:_(s32), [[AIE_POSTINC_3D_LOAD5:%[0-9]+]]:_(p0), [[AIE_POSTINC_3D_LOAD6:%[0-9]+]]:_(s20), [[AIE_POSTINC_3D_LOAD7:%[0-9]+]]:_ = G_AIE_POSTINC_3D_LOAD [[COPY1]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (load (s32)) + ; CHECK-NEXT: [[AIE_OFFSET_LOAD1:%[0-9]+]]:_(s32) = G_AIE_OFFSET_LOAD [[COPY1]](p0), [[C]](s20) :: (load (s32)) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[AIE_POSTINC_3D_LOAD]](s32), implicit [[AIE_OFFSET_LOAD]](s32), implicit [[AIE_POSTINC_3D_LOAD4]](s32), implicit [[AIE_OFFSET_LOAD1]](s32), implicit [[AIE_POSTINC_3D_LOAD1]](p0), implicit [[AIE_POSTINC_3D_LOAD5]](p0) + %0:_(p0) = COPY $p0 + %1:_(p0) = COPY $p1 + %2:_(s20) = G_CONSTANT i20 8 + %3:_(s32) = G_LOAD %0(p0) :: (load (s32)) + %4:_(p0) = nuw G_PTR_ADD %0, %2 + %5:_(s32) = G_LOAD %4(p0) :: (load (s32)) + %6:_(s32) = G_LOAD %1(p0) :: (load (s32)) + %7:_(p0) = nuw G_PTR_ADD %1, %2 + %8:_(s32) = G_LOAD %7(p0) :: (load (s32)) + %9:_(p0), %10:_(s20), %11:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2p.add.3d), %0:_(p0), %2:_(s20), %2:_(s20), %2:_(s20), %2:_(s20), %2:_(s20), %2:_(s20), %2:_(s20) + %12:_(p0), %13:_(s20), %14:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2p.add.3d), %1:_(p0), %2:_(s20), %2:_(s20), %2:_(s20), %2:_(s20), %2:_(s20), %2:_(s20), %2:_(s20) + PseudoRET implicit $lr, implicit %3, implicit %5, implicit %6, implicit %8, implicit %9, implicit %12 +... 
+ + + From f21c70f6936d6374a3454406a2daa41704acd204 Mon Sep 17 00:00:00 2001 From: Fabian Stuckmann Date: Thu, 22 May 2025 09:34:40 -0600 Subject: [PATCH 2/6] [AIEX][NFC] refactor Pointer Modifier Support --- llvm/lib/Target/AIE/AIE2InstrInfo.cpp | 18 ++++ llvm/lib/Target/AIE/AIE2InstrInfo.h | 2 + llvm/lib/Target/AIE/AIEBaseInstrInfo.cpp | 87 ++++++++++++++++ llvm/lib/Target/AIE/AIEBaseInstrInfo.h | 38 +++++++ llvm/lib/Target/AIE/AIECombinerHelper.cpp | 101 ++++++++----------- llvm/lib/Target/AIE/aie2p/AIE2PInstrInfo.cpp | 52 ++++++++++ llvm/lib/Target/AIE/aie2p/AIE2PInstrInfo.h | 1 + 7 files changed, 242 insertions(+), 57 deletions(-) diff --git a/llvm/lib/Target/AIE/AIE2InstrInfo.cpp b/llvm/lib/Target/AIE/AIE2InstrInfo.cpp index 11200aa23e19..67bbbf9f5863 100644 --- a/llvm/lib/Target/AIE/AIE2InstrInfo.cpp +++ b/llvm/lib/Target/AIE/AIE2InstrInfo.cpp @@ -1503,6 +1503,24 @@ bool AIE2InstrInfo::isOffsetInImmediateRange( } } +namespace { +static const std::map> S20Consumers = { + {Intrinsic::aie2_add_2d, {4, 5, 6, 7}}, + {Intrinsic::aie2_add_3d, {5, 6, 7, 8, 9, 10, 11}}}; + +static const std::map> + PtrInputAndOutputIdx = {{Intrinsic::aie2_add_2d, {3, 0}}, + {Intrinsic::aie2_add_3d, {4, 0}}}; + +static const AIEBaseInstrInfo::PTRModSupport AIE2PTRModSupport{ + &S20Consumers, &PtrInputAndOutputIdx}; + +} // namespace + +const AIEBaseInstrInfo::PTRModSupport &AIE2InstrInfo::getPTRModSupport() const { + return AIE2PTRModSupport; +} + unsigned AIE2InstrInfo::getPseudoJNZDOpcode() const { return AIE2::PseudoJNZD; } unsigned AIE2InstrInfo::getNumBypassedCycles(const InstrItineraryData *ItinData, diff --git a/llvm/lib/Target/AIE/AIE2InstrInfo.h b/llvm/lib/Target/AIE/AIE2InstrInfo.h index e248975634e8..783f9f78db3e 100644 --- a/llvm/lib/Target/AIE/AIE2InstrInfo.h +++ b/llvm/lib/Target/AIE/AIE2InstrInfo.h @@ -109,6 +109,8 @@ class AIE2InstrInfo : public AIE2GenInstrInfo { isOffsetInImmediateRange(unsigned Opcode, unsigned LoadStoreSize, std::optional Immediate) const override; 
+ const PTRModSupport &getPTRModSupport() const override; + virtual unsigned getPseudoJNZDOpcode() const override; unsigned getNumBypassedCycles(const InstrItineraryData *ItinData, diff --git a/llvm/lib/Target/AIE/AIEBaseInstrInfo.cpp b/llvm/lib/Target/AIE/AIEBaseInstrInfo.cpp index e8eb8f864c80..531955554fd8 100644 --- a/llvm/lib/Target/AIE/AIEBaseInstrInfo.cpp +++ b/llvm/lib/Target/AIE/AIEBaseInstrInfo.cpp @@ -23,6 +23,7 @@ #include "Utils/AIELoopUtils.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -1288,3 +1289,89 @@ AIEBaseInstrInfo::getAlignmentBoundaries(MachineBasicBlock &MBB) const { return AlgnCandidates; } + +bool llvm::AIEBaseInstrInfo::PTRModSupport::isNativeS20Consumer( + const MachineInstr &MI) const { + switch (MI.getOpcode()) { + case TargetOpcode::G_PTR_ADD: + return true; + case TargetOpcode::G_INTRINSIC: + case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: { + const unsigned IntrinsicID = cast(MI).getIntrinsicID(); + return S20Consumers->count(IntrinsicID); + } + + default: + return false; + } +} + +bool llvm::AIEBaseInstrInfo::PTRModSupport::isNativeS20Operand( + const MachineInstr &MI, unsigned OperandIdx) const { + switch (MI.getOpcode()) { + case TargetOpcode::G_PTR_ADD: + return true; + case TargetOpcode::G_INTRINSIC: + case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: { + const unsigned IntrinsicID = cast(MI).getIntrinsicID(); + return isNativeS20ConsumerIntrinsicOperand(IntrinsicID, OperandIdx); + } + default: + return false; + } +} + +bool llvm::AIEBaseInstrInfo::PTRModSupport::isNativeS20ConsumerIntrinsicOperand( + const unsigned IntrinsicID, unsigned OperandIdx) const { + auto It = S20Consumers->find(IntrinsicID); + if (It == S20Consumers->end()) + return false; + + const std::set &Indices = It->second; + return 
Indices.find(OperandIdx) != Indices.end(); +} + +std::optional llvm::AIEBaseInstrInfo::PTRModSupport::getInputPtrIdx( + const MachineInstr &MI) const { + switch (MI.getOpcode()) { + case TargetOpcode::G_PTR_ADD: + return 1; + case TargetOpcode::G_INTRINSIC: + case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: { + const unsigned IntrinsicID = cast(MI).getIntrinsicID(); + return getInputPtrIdx(IntrinsicID); + } + default: + return {}; + } +} + +unsigned llvm::AIEBaseInstrInfo::PTRModSupport::getInputPtrIdx( + const unsigned OpCode) const { + auto It = PtrInputAndOutputIdx->find(OpCode); + assert(It != PtrInputAndOutputIdx->end()); + return It->second.first; +} + +std::optional llvm::AIEBaseInstrInfo::PTRModSupport::getOutputPtrIdx( + const MachineInstr &MI) const { + switch (MI.getOpcode()) { + case TargetOpcode::G_PTR_ADD: + return 0; + case TargetOpcode::G_INTRINSIC: + case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: { + const unsigned IntrinsicID = cast(MI).getIntrinsicID(); + return getOutputPtrIdx(IntrinsicID); + } + + default: + return {}; + } +} + +unsigned llvm::AIEBaseInstrInfo::PTRModSupport::getOutputPtrIdx( + const unsigned OpCode) const { + auto It = PtrInputAndOutputIdx->find(OpCode); + assert(It != PtrInputAndOutputIdx->end()); + return It->second.second; +} diff --git a/llvm/lib/Target/AIE/AIEBaseInstrInfo.h b/llvm/lib/Target/AIE/AIEBaseInstrInfo.h index f8e08e86a8b1..4d9d6cb7764c 100644 --- a/llvm/lib/Target/AIE/AIEBaseInstrInfo.h +++ b/llvm/lib/Target/AIE/AIEBaseInstrInfo.h @@ -27,6 +27,7 @@ #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/Support/ErrorHandling.h" #include +#include namespace llvm { @@ -64,6 +65,43 @@ struct AIEBaseInstrInfo : public TargetInstrInfo { llvm_unreachable("Target didn't implement OffsetFitImmRange"); } + class PTRModSupport { + /// Map between GIntrinsic and the OperandIndices that consume S20 Operands + const std::map> *S20Consumers = nullptr; + /// Map between GIntrinsic/Opcode and the OperandIndices + const 
std::map> + *PtrInputAndOutputIdx = nullptr; + + /// \return whether \p OperandIdx of \p IntrinsicID is a native S20 Operand + bool isNativeS20ConsumerIntrinsicOperand(const unsigned IntrinsicID, + const unsigned OperandIdx) const; + + unsigned getInputPtrIdx(const unsigned OpCode) const; + unsigned getOutputPtrIdx(const unsigned OpCode) const; + + public: + PTRModSupport(const std::map> *S20Consumers, + const std::map> + *PtrInputAndOutputIdx) + : S20Consumers(S20Consumers), + PtrInputAndOutputIdx(PtrInputAndOutputIdx) {} + + /// \return whether \p MI consumes S20 + bool isNativeS20Consumer(const MachineInstr &MI) const; + + /// \return whether \p OperandIdx of \p MI is a native S20 operand + bool isNativeS20Operand(const MachineInstr &MI, unsigned OperandIdx) const; + + std::optional getInputPtrIdx(const MachineInstr &MI) const; + + std::optional getOutputPtrIdx(const MachineInstr &MI) const; + }; + + /// Return the target's PTRModSupport used to query pointer modifiers + virtual const PTRModSupport &getPTRModSupport() const { + llvm_unreachable("Target didn't implement getPTRModSupport"); + } + /// Return the opcode for a return instruction virtual unsigned getReturnOpcode() const { llvm_unreachable("Target didn't implement getReturnOpcode"); diff --git a/llvm/lib/Target/AIE/AIECombinerHelper.cpp b/llvm/lib/Target/AIE/AIECombinerHelper.cpp index 47af221c98d1..98e997215598 100644 --- a/llvm/lib/Target/AIE/AIECombinerHelper.cpp +++ b/llvm/lib/Target/AIE/AIECombinerHelper.cpp @@ -69,6 +69,11 @@ static unsigned getNumMaskUndefs(const ArrayRef &Mask, return Count; } +static const AIEBaseInstrInfo *getInstrInfo(const MachineInstr &MI) { + return static_cast( + MI.getMF()->getSubtarget().getInstrInfo()); +} + bool MaskMatch::isValidMask(const ArrayRef Mask) const { for (unsigned Idx = 0; Idx < Mask.size(); ++Idx) { if (Mask[Idx] == -1) @@ -773,57 +778,27 @@ static bool canProduceS20(const MachineRegisterInfo &MRI, } } -/// Checks if the intrinsic natively consumes S20 for scalar
inputs. -static bool isNativeS20ConsumerIntrinsic(const unsigned IntrinsicID, - std::optional OperandIdx) { - static const std::map> S20OpIndices = { - {Intrinsic::aie2_add_2d, {4, 5, 6, 7}}, - {Intrinsic::aie2_add_3d, {5, 6, 7, 8, 9, 10, 11}}, - {Intrinsic::aie2p_add_2d, {4, 5, 6, 7}}, - {Intrinsic::aie2p_add_3d, {5, 6, 7, 8, 9, 10, 11}}, - {Intrinsic::aie2p_fifo_st_flush_1d, {7}}, - {Intrinsic::aie2p_fifo_st_flush_1d_conv, {7}}, - {Intrinsic::aie2p_fifo_ld_pop_1d_unaligned, {8}}, - {Intrinsic::aie2p_fifo_st_flush_2d, {8, 9, 10, 11}}, - {Intrinsic::aie2p_fifo_st_flush_2d_conv, {8, 9, 10, 11}}, - {Intrinsic::aie2p_fifo_ld_pop_544_1d_bfp16, {9}}, - {Intrinsic::aie2p_fifo_ld_pop_576_1d_bfp16, {9}}, - {Intrinsic::aie2p_fifo_ld_pop_2d_unaligned, {9, 10, 11, 12}}, - {Intrinsic::aie2p_fifo_st_flush_3d, {9, 10, 11, 12, 13, 14, 15}}, - {Intrinsic::aie2p_fifo_st_flush_3d_conv, {9, 10, 11, 12, 13, 14, 15}}, - {Intrinsic::aie2p_fifo_ld_pop_544_2d_bfp16, {10, 11, 12, 13}}, - {Intrinsic::aie2p_fifo_ld_pop_576_2d_bfp16, {10, 11, 12, 13}}, - {Intrinsic::aie2p_fifo_ld_pop_3d_unaligned, {10, 11, 12, 13, 14, 15, 16}}, - {Intrinsic::aie2p_fifo_ld_pop_544_3d_bfp16, {11, 12, 13, 14, 15, 16, 17}}, - {Intrinsic::aie2p_fifo_ld_pop_576_3d_bfp16, - {11, 12, 13, 14, 15, 16, 17}}}; - - auto It = S20OpIndices.find(IntrinsicID); - if (It == S20OpIndices.end()) - return false; - - if (!OperandIdx) { - return true; - } - const std::set &Indices = It->second; - return Indices.find(*OperandIdx) != Indices.end(); -} - -/// Checks if the instruction natively consumes S20 for scalar inputs. 
-static bool -isNativeS20Consumer(const MachineInstr &MI, - std::optional OperandIdx = std::nullopt) { - switch (MI.getOpcode()) { - case TargetOpcode::G_PTR_ADD: - return true; - case TargetOpcode::G_INTRINSIC: - case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: { - const unsigned IntrinsicID = cast(MI).getIntrinsicID(); - return isNativeS20ConsumerIntrinsic(IntrinsicID, OperandIdx); - } - - default: - return false; +/// Returns the indices of all operands of \p MI that use \p Reg (or a register +/// overlapping it, when \p TRI is provided). If \p IsKill is true, only uses +/// that kill the register are returned. +static std::vector +findAllRegisterUseOperandIdx(MachineInstr &MI, Register Reg, + bool IsKill = false, + const TargetRegisterInfo *TRI = nullptr) { + { + std::vector UseIndices; + for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { + const MachineOperand &MO = MI.getOperand(I); + if (!MO.isReg() || !MO.isUse()) + continue; + Register MOReg = MO.getReg(); + if (!MOReg) + continue; + if (MOReg == Reg || (TRI && Reg && MOReg && TRI->regsOverlap(MOReg, Reg))) + if (!IsKill || MO.isKill()) + UseIndices.push_back(I); + } + return UseIndices; } } @@ -856,6 +831,9 @@ bool canNarrowUserTreeToS20(MachineRegisterInfo &MRI, InstrNode Start, return false; } + const auto *TII = getInstrInfo(MI); + auto &PtrModSupport = TII->getPTRModSupport(); + // Now check if users can be adapted to consume an S20 input assert(MI.getNumExplicitDefs() == 1); Register DefReg = MI.getOperand(0).getReg(); @@ -881,10 +859,15 @@ bool canNarrowUserTreeToS20(MachineRegisterInfo &MRI, InstrNode Start, return false; continue; default: - if (isNativeS20Consumer(Use, Use.findRegisterUseOperandIdx(DefReg))) - continue; - LLVM_DEBUG(dbgs() << " User cannot consume S20: " << Use); - return false; + // Every use of DefReg in Use must be a native S20 operand + auto UseIndices = findAllRegisterUseOperandIdx(Use, DefReg); + assert(!UseIndices.empty()); + for (auto &Idx : UseIndices) { + if 
(!PtrModSupport.isNativeS20Operand(Use, Idx)) { + LLVM_DEBUG(dbgs() << " User cannot consume S20: " << Use); + return false; + } + } } } LLVM_DEBUG(dbgs() << " Can be narrowed: " << MI); @@ -1005,7 +988,9 @@ bool getOperandsToNarrow(MachineInstr &MI, MachineRegisterInfo &MRI, bool llvm::matchS20NarrowingOpt(MachineInstr &MI, MachineRegisterInfo &MRI, std::set &ValidStartNodes) { - if (!EnableS20Narrowing || !isNativeS20Consumer(MI)) + auto *TII = getInstrInfo(MI); + auto &PtrModSupport = TII->getPTRModSupport(); + if (!EnableS20Narrowing || !PtrModSupport.isNativeS20Consumer(MI)) return false; return getOperandsToNarrow(MI, MRI, ValidStartNodes); } @@ -1064,7 +1049,9 @@ bool modifyToS20(InstrNode Start, MachineRegisterInfo &MRI, MachineIRBuilder &B, } // Easy case - if (isNativeS20Consumer(*StartNodeMI)) + const auto *TII = getInstrInfo(*StartNodeMI); + auto &PtrModSupport = TII->getPTRModSupport(); + if (PtrModSupport.isNativeS20Consumer(*StartNodeMI)) return true; LLVM_DEBUG(dbgs() << "Narrow operand of :" << *StartNodeMI); diff --git a/llvm/lib/Target/AIE/aie2p/AIE2PInstrInfo.cpp b/llvm/lib/Target/AIE/aie2p/AIE2PInstrInfo.cpp index bbf4255cf616..92029fcdfabc 100644 --- a/llvm/lib/Target/AIE/aie2p/AIE2PInstrInfo.cpp +++ b/llvm/lib/Target/AIE/aie2p/AIE2PInstrInfo.cpp @@ -1814,6 +1814,58 @@ bool AIE2PInstrInfo::isOffsetInImmediateRange( } } +namespace { +/// Map from intrinsic ID to the operand indices that natively consume S20. +static const std::map> S20Consumers = { + {Intrinsic::aie2p_add_2d, {4, 5, 6, 7}}, + {Intrinsic::aie2p_add_3d, {5, 6, 7, 8, 9, 10, 11}}, + {Intrinsic::aie2p_fifo_st_flush_1d, {7}}, + {Intrinsic::aie2p_fifo_st_flush_1d_conv, {7}}, + {Intrinsic::aie2p_fifo_ld_pop_1d_unaligned, {8}}, + {Intrinsic::aie2p_fifo_st_flush_2d, {8, 9, 10, 11}}, + {Intrinsic::aie2p_fifo_st_flush_2d_conv, {8, 9, 10, 11}}, + {Intrinsic::aie2p_fifo_ld_pop_544_1d_bfp16, {9}}, + {Intrinsic::aie2p_fifo_ld_pop_576_1d_bfp16, {9}}, + {Intrinsic::aie2p_fifo_ld_pop_2d_unaligned, {9, 10, 11, 12}}, + 
{Intrinsic::aie2p_fifo_st_flush_3d, {9, 10, 11, 12, 13, 14, 15}}, + {Intrinsic::aie2p_fifo_st_flush_3d_conv, {9, 10, 11, 12, 13, 14, 15}}, + {Intrinsic::aie2p_fifo_ld_pop_544_2d_bfp16, {10, 11, 12, 13}}, + {Intrinsic::aie2p_fifo_ld_pop_576_2d_bfp16, {10, 11, 12, 13}}, + {Intrinsic::aie2p_fifo_ld_pop_3d_unaligned, {10, 11, 12, 13, 14, 15, 16}}, + {Intrinsic::aie2p_fifo_ld_pop_544_3d_bfp16, {11, 12, 13, 14, 15, 16, 17}}, + {Intrinsic::aie2p_fifo_ld_pop_576_3d_bfp16, {11, 12, 13, 14, 15, 16, 17}}}; + +static const std::map> + PtrInputAndOutputIdx = { + {Intrinsic::aie2p_add_2d, {3, 0}}, + {Intrinsic::aie2p_add_3d, {4, 0}}, + {Intrinsic::aie2p_fifo_st_flush_1d, {3, 0}}, + {Intrinsic::aie2p_fifo_st_flush_1d_conv, {3, 0}}, + {Intrinsic::aie2p_fifo_ld_pop_1d_unaligned, {3, 0}}, + {Intrinsic::aie2p_fifo_st_flush_2d, {4, 0}}, + {Intrinsic::aie2p_fifo_st_flush_2d_conv, {4, 0}}, + {Intrinsic::aie2p_fifo_ld_pop_544_1d_bfp16, {3, 0}}, + {Intrinsic::aie2p_fifo_ld_pop_576_1d_bfp16, {3, 0}}, + {Intrinsic::aie2p_fifo_ld_pop_2d_unaligned, {4, 0}}, + {Intrinsic::aie2p_fifo_st_flush_3d, {5, 0}}, + {Intrinsic::aie2p_fifo_st_flush_3d_conv, {5, 0}}, + {Intrinsic::aie2p_fifo_ld_pop_544_2d_bfp16, {4, 0}}, + {Intrinsic::aie2p_fifo_ld_pop_576_2d_bfp16, {4, 0}}, + {Intrinsic::aie2p_fifo_ld_pop_3d_unaligned, {5, 0}}, + {Intrinsic::aie2p_fifo_ld_pop_544_3d_bfp16, {5, 0}}, + {Intrinsic::aie2p_fifo_ld_pop_576_3d_bfp16, {5, 0}}, +}; + +static const AIEBaseInstrInfo::PTRModSupport AIE2PPTRModSupport{ + &S20Consumers, &PtrInputAndOutputIdx}; + +} // namespace + +const AIEBaseInstrInfo::PTRModSupport & +AIE2PInstrInfo::getPTRModSupport() const { + return AIE2PPTRModSupport; +} + unsigned AIE2PInstrInfo::getGenericAddVectorEltOpcode() const { return AIE2P::G_AIE_ADD_VECTOR_ELT_HI; } diff --git a/llvm/lib/Target/AIE/aie2p/AIE2PInstrInfo.h b/llvm/lib/Target/AIE/aie2p/AIE2PInstrInfo.h index 0aa292324442..387d8b19cbe2 100644 --- a/llvm/lib/Target/AIE/aie2p/AIE2PInstrInfo.h +++ 
b/llvm/lib/Target/AIE/aie2p/AIE2PInstrInfo.h @@ -113,6 +113,7 @@ class AIE2PInstrInfo : public AIE2PGenInstrInfo { bool isOffsetInImmediateRange(unsigned Opcode, unsigned LoadStoreSize, std::optional Immediate) const override; + virtual const PTRModSupport &getPTRModSupport() const override; unsigned getNumBypassedCycles(const InstrItineraryData *ItinData, const MachineInstr &DefMI, unsigned DefIdx, const MachineInstr &UseMI, From aac20f1f6a603bf57ab0f521cc6bd10fedee60be Mon Sep 17 00:00:00 2001 From: Fabian Stuckmann Date: Tue, 17 Jun 2025 06:24:52 -0600 Subject: [PATCH 3/6] [AIEX][NFC] Refactor existing pre/postinc combiners --- llvm/lib/Target/AIE/AIE.h | 2 + .../AIE/AIE2PostLegalizerCustomCombiner.cpp | 20 +- llvm/lib/Target/AIE/AIE2TargetMachine.cpp | 3 + llvm/lib/Target/AIE/AIEBaseTargetMachine.cpp | 4 + llvm/lib/Target/AIE/AIECombine.td | 7 +- llvm/lib/Target/AIE/AIECombinerHelper.cpp | 214 +++++++++++++----- llvm/lib/Target/AIE/AIECombinerHelper.h | 38 ++-- llvm/lib/Target/AIE/AIEGlobalCombiner.cpp | 147 ++++++++++++ llvm/lib/Target/AIE/AIEGlobalCombiner.h | 111 +++++++++ llvm/lib/Target/AIE/AIEPtrModOptimizer.cpp | 160 +++++++++++++ llvm/lib/Target/AIE/AIEPtrModOptimizer.h | 100 ++++++++ llvm/lib/Target/AIE/CMakeLists.txt | 2 + .../AIE2PPostLegalizerCustomCombiner.cpp | 15 +- .../Target/AIE/aie2p/AIE2PTargetMachine.cpp | 3 + 14 files changed, 743 insertions(+), 83 deletions(-) create mode 100644 llvm/lib/Target/AIE/AIEGlobalCombiner.cpp create mode 100644 llvm/lib/Target/AIE/AIEGlobalCombiner.h create mode 100644 llvm/lib/Target/AIE/AIEPtrModOptimizer.cpp create mode 100644 llvm/lib/Target/AIE/AIEPtrModOptimizer.h diff --git a/llvm/lib/Target/AIE/AIE.h b/llvm/lib/Target/AIE/AIE.h index 412c631b1cae..a1cf8477880d 100644 --- a/llvm/lib/Target/AIE/AIE.h +++ b/llvm/lib/Target/AIE/AIE.h @@ -54,6 +54,7 @@ FunctionPass *createAIEBaseHardwareLoopsPass(); FunctionPass *createAIEPseudoBranchExpansion(); FunctionPass *createAIESubRegConstrainer(); MachineFunctionPass 
*createAIEClusterBaseAddress(); +MachineFunctionPass *createAIEPtrModOptimizer(); MachineFunctionPass *createAIEAddressSpaceFlattening(); MachineFunctionPass *createAIEEliminateDuplicatePHI(); FunctionPass *createAIEOutlineMemoryGEP(); @@ -65,6 +66,7 @@ createDeadMachineInstructionElim(bool KeepLifetimeInstructions); void initializeAIEBaseHardwareLoopsPass(PassRegistry &); void initializeAIEClusterBaseAddressPass(PassRegistry &); +void initializeAIEPtrModOptimizerPass(PassRegistry &); void initializeAIEAddressSpaceFlatteningPass(PassRegistry &); void initializeAIEEliminateDuplicatePHIPass(PassRegistry &); extern char &AIEFormatSelectorID; diff --git a/llvm/lib/Target/AIE/AIE2PostLegalizerCustomCombiner.cpp b/llvm/lib/Target/AIE/AIE2PostLegalizerCustomCombiner.cpp index 886c41789186..e8666b41cd2c 100644 --- a/llvm/lib/Target/AIE/AIE2PostLegalizerCustomCombiner.cpp +++ b/llvm/lib/Target/AIE/AIE2PostLegalizerCustomCombiner.cpp @@ -15,6 +15,7 @@ #include "AIE2TargetMachine.h" #include "AIECombinerHelper.h" +#include "AIEPtrModOptimizer.h" #include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include "llvm/CodeGen/GlobalISel/Combiner.h" #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" @@ -45,6 +46,8 @@ class AIE2PostLegalizerCustomCombinerImpl : public Combiner { protected: // TODO: Make CombinerHelper methods const. 
mutable CombinerHelper Helper; + AIE::FoundCombiners EmptyGlobalCombiner; + AIE::FoundCombiners *GlobalCombiners = nullptr; const AIE2PostLegalizerCustomCombinerImplRuleConfig &RuleConfig; const AIE2Subtarget &STI; @@ -52,6 +55,7 @@ class AIE2PostLegalizerCustomCombinerImpl : public Combiner { AIE2PostLegalizerCustomCombinerImpl( MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC, GISelKnownBits &KB, GISelCSEInfo *CSEInfo, + AIE::FoundCombiners *GlobalCombiner, const AIE2PostLegalizerCustomCombinerImplRuleConfig &RuleConfig, const AIE2Subtarget &STI, MachineDominatorTree *MDT, const LegalizerInfo *LI); @@ -73,17 +77,19 @@ class AIE2PostLegalizerCustomCombinerImpl : public Combiner { AIE2PostLegalizerCustomCombinerImpl::AIE2PostLegalizerCustomCombinerImpl( MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC, GISelKnownBits &KB, GISelCSEInfo *CSEInfo, + AIE::FoundCombiners *GlobalCombiner, const AIE2PostLegalizerCustomCombinerImplRuleConfig &RuleConfig, - const AIE2Subtarget &STI, - MachineDominatorTree *MDT, + const AIE2Subtarget &STI, MachineDominatorTree *MDT, const LegalizerInfo *LI) : Combiner(MF, CInfo, TPC, &KB, CSEInfo), Helper(Observer, B, /*IsPostLegalize*/ false, &KB, MDT, LI), - RuleConfig(RuleConfig), STI(STI), + GlobalCombiners(GlobalCombiner), RuleConfig(RuleConfig), STI(STI), #define GET_GICOMBINER_CONSTRUCTOR_INITS #include "AIE2GenPostLegalizerGICustomCombiner.inc" #undef GET_GICOMBINER_CONSTRUCTOR_INITS { + if (!GlobalCombiner) + GlobalCombiners = &EmptyGlobalCombiner; } class AIE2PostLegalizerCustomCombiner : public MachineFunctionPass { @@ -107,6 +113,7 @@ class AIE2PostLegalizerCustomCombiner : public MachineFunctionPass { AU.addPreserved(); AU.addRequired(); AU.addPreserved(); + AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -145,11 +152,16 @@ bool AIE2PostLegalizerCustomCombiner::runOnMachineFunction( GISelKnownBits *KB = &getAnalysis().get(MF); MachineDominatorTree *MDT = 
&getAnalysis(); + AIE::FoundCombiners *AIEGlobalPtrIncResults = nullptr; + if (auto *PtrModOptPass = getAnalysisIfAvailable()) + AIEGlobalPtrIncResults = PtrModOptPass->getGlobalPtrCombiners(); + CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, /*LegalizerInfo*/ nullptr, EnableOpt, F.hasOptSize(), F.hasMinSize()); AIE2PostLegalizerCustomCombinerImpl Impl(MF, CInfo, TPC, *KB, CSEInfo, - RuleConfig, ST, MDT, LI); + AIEGlobalPtrIncResults, RuleConfig, + ST, MDT, LI); return Impl.combineMachineInstrs(); } diff --git a/llvm/lib/Target/AIE/AIE2TargetMachine.cpp b/llvm/lib/Target/AIE/AIE2TargetMachine.cpp index 9d98352b33e6..6f837e84887b 100644 --- a/llvm/lib/Target/AIE/AIE2TargetMachine.cpp +++ b/llvm/lib/Target/AIE/AIE2TargetMachine.cpp @@ -41,6 +41,7 @@ static cl::opt EnableOutlineMemoryGEP( cl::desc("Enable Outlining GEPs in Memory Instructions.")); extern cl::opt EnableAddressChaining; +extern cl::opt EnableGlobalPtrModOptimizer; extern cl::opt EnableStagedRA; extern cl::opt EnableSuperRegSplitting; extern cl::opt AllocateMRegsFirst; @@ -89,6 +90,8 @@ void AIE2PassConfig::addPreRegBankSelect() { addPass(createAIE2PostLegalizerGenericCombiner()); if (EnableAddressChaining) addPass(createAIEClusterBaseAddress()); + if (EnableGlobalPtrModOptimizer) + addPass(createAIEPtrModOptimizer()); addPass(createAIE2PostLegalizerCustomCombiner()); } } diff --git a/llvm/lib/Target/AIE/AIEBaseTargetMachine.cpp b/llvm/lib/Target/AIE/AIEBaseTargetMachine.cpp index dbd15a05eeb0..1fddc94a4bf0 100644 --- a/llvm/lib/Target/AIE/AIEBaseTargetMachine.cpp +++ b/llvm/lib/Target/AIE/AIEBaseTargetMachine.cpp @@ -73,6 +73,10 @@ cl::opt EnableAddressChaining("aie-address-chaining", cl::Hidden, cl::init(true), cl::desc("Enable ptradd chaining.")); +cl::opt EnableGlobalPtrModOptimizer( + "aie-global-ptr-mod-opt", cl::Hidden, cl::init(false), + cl::desc("Enable global pointer modifier optimization.")); + cl::opt EnableStagedRA("aie-staged-ra", cl::Hidden, cl::init(true), 
cl::desc("Enable multi-stage register allocation")); diff --git a/llvm/lib/Target/AIE/AIECombine.td b/llvm/lib/Target/AIE/AIECombine.td index 8e25e35f2fde..cc672a53a23b 100644 --- a/llvm/lib/Target/AIE/AIECombine.td +++ b/llvm/lib/Target/AIE/AIECombine.td @@ -237,12 +237,11 @@ def combine_upd_to_concat : GICombineRule< (apply [{ applyUpdToConcat(*${root}, MRI, B, ${matchinfo}); }]) >; -def load_store_increment_matchdata : GIDefMatchData<"AIELoadStoreCombineMatchData">; def combine_load_store_increment : GICombineRule < - (defs root:$root, load_store_increment_matchdata:$matchinfo), + (defs root:$root), (match (wip_match_opcode G_LOAD, G_ZEXTLOAD, G_SEXTLOAD, G_STORE):$root, - [{ return matchLdStInc(*${root}, MRI, ${matchinfo}, Helper, B.getTII()); }]), - (apply [{ applyLdStInc(*${root}, MRI, B, ${matchinfo}, Observer); }] ) + [{ return matchLdStInc(*${root}, MRI, Helper, B.getTII(), GlobalCombiners); }]), + (apply [{ applyLdStInc(*${root}, MRI, Helper, B, Observer, GlobalCombiners); }] ) >; def combine_add_vector_elt_undef : GICombineRule < diff --git a/llvm/lib/Target/AIE/AIECombinerHelper.cpp b/llvm/lib/Target/AIE/AIECombinerHelper.cpp index 98e997215598..2adbfa348c32 100644 --- a/llvm/lib/Target/AIE/AIECombinerHelper.cpp +++ b/llvm/lib/Target/AIE/AIECombinerHelper.cpp @@ -202,10 +202,11 @@ MaskMatch::getFrequentIndexResult(const ArrayRef Mask, return FrequentIndexResult{FrequentIdx, NonMatchingCount}; } -MachineInstr *findPreIncMatch(MachineInstr &MemI, MachineRegisterInfo &MRI, - CombinerHelper &Helper, - AIELoadStoreCombineMatchData &MatchData, - const AIEBaseInstrInfo &TII) { +static MachineInstr *findPreIncMatch(MachineInstr &MemI, + MachineRegisterInfo &MRI, + CombinerHelper &Helper, + const AIEBaseInstrInfo &TII, + AIE::FoundCombiners *GlobalCombinerPtr) { // This is currently done with patterns in instruction selection. // No need to do it here. 
const unsigned VecSize = @@ -219,9 +220,14 @@ MachineInstr *findPreIncMatch(MachineInstr &MemI, MachineRegisterInfo &MRI, Register Addr = MemI.getOperand(1).getReg(); MachineInstr *AddrDef = getDefIgnoringCopies(Addr, MRI); if (AddrDef->getOpcode() == TargetOpcode::G_PTR_ADD) { - MatchData = {AddrDef, TII.getOffsetMemOpcode(MemI.getOpcode()), &MemI, - /*ExtraInstrsToMove=*/{}, - /*RemoveInstr=*/false}; + // 2 Instructions are in the Combiner + BitVector RemoveInstrs(2); + GlobalCombinerPtr->append(AIE::Combiner( + /*CombineInstrs=*/std::vector{AddrDef, &MemI}, + /*CombinedInstrOpcode=*/TII.getOffsetMemOpcode(MemI.getOpcode()), + /*InsertionPoint=*/&MemI, /*CombineRoot=*/&MemI, + /*MoveUpInstrsToInsertionPoint=*/std::vector{}, + /*RemoveInstrs=*/RemoveInstrs, /*Name=*/"Offset-legacy")); return AddrDef; } return nullptr; @@ -269,7 +275,7 @@ bool isNonCoalesceableUseOf(const MachineInstr &MemI, /// \return true if \a MemI can be moved just before \a Dest in order to allow /// post-increment combining bool llvm::canDelayMemOp(MachineInstr &MemI, MachineInstr &Dest, - MachineRegisterInfo &MRI) { + const MachineRegisterInfo &MRI) { if (MemI.getParent() != Dest.getParent()) return false; auto MII = std::next(MemI.getIterator()); @@ -433,9 +439,10 @@ findEarliestInsertPoint(MachineInstr &Instr, MachineInstr &NoMoveBeforeInstr, return EarliestInstrPos; } -std::vector +static std::vector findConstantOffsetsToMove(MachineInstr &PtrAdd, MachineInstr &PtrAddInsertLoc, - MachineRegisterInfo &MRI, CombinerHelper &Helper) { + const MachineRegisterInfo &MRI, + CombinerHelper &Helper) { // By moving the PtrAdd up without considering if we are moving past a // G_CONSTANT defining one of the uses of the PtrAdd we are generating // incorrect code (use before def). 
We have to search those G_CONSTANTs and @@ -455,8 +462,8 @@ findConstantOffsetsToMove(MachineInstr &PtrAdd, MachineInstr &PtrAddInsertLoc, } // Check that MI is after First and not after Last -bool isBetween(MachineInstr &MI, MachineInstr &First, MachineInstr &Last, - CombinerHelper &Helper) { +static bool isBetween(MachineInstr &MI, MachineInstr &First, MachineInstr &Last, + CombinerHelper &Helper) { assert(First.getParent() == Last.getParent()); // If it's in another block, it can't be between if (MI.getParent() != First.getParent()) { @@ -468,10 +475,11 @@ bool isBetween(MachineInstr &MI, MachineInstr &First, MachineInstr &Last, return !Helper.dominates(MI, First) && Helper.dominates(MI, Last); } -MachineInstr *findPostIncMatch(MachineInstr &MemI, MachineRegisterInfo &MRI, - CombinerHelper &Helper, - AIELoadStoreCombineMatchData &MatchData, - const AIEBaseInstrInfo &TII) { +static MachineInstr *findPostIncMatch(MachineInstr &MemI, + MachineRegisterInfo &MRI, + CombinerHelper &Helper, + const AIEBaseInstrInfo &TII, + AIE::FoundCombiners *GlobalCombinerPtr) { if (!EnablePostIncCombine) return nullptr; @@ -480,7 +488,14 @@ MachineInstr *findPostIncMatch(MachineInstr &MemI, MachineRegisterInfo &MRI, if (VecSize > TII.getMaxSupportedLdStIncSize()) { return nullptr; } + // 2 Instructions are in the Combiner + BitVector RemovePtrInc(2); + // remove PtrInc + RemovePtrInc.set(0); + + MachineInstr *InsertionPoint = nullptr; Register Addr = MemI.getOperand(1).getReg(); + AIE::Combiner TempCombiner; for (auto &PtrInc : MRI.use_nodbg_instructions(Addr)) { if (MemI.getParent() != PtrInc.getParent()) continue; @@ -503,9 +518,14 @@ MachineInstr *findPostIncMatch(MachineInstr &MemI, MachineRegisterInfo &MRI, })) { continue; } - MatchData = {&PtrInc, *CombinedOpcode, &MemI, - /*ExtraInstrsToMove=*/{}, - /*RemoveInstr=*/true}; + InsertionPoint = &MemI; + TempCombiner = AIE::Combiner( + /*CombineInstrs=*/std::vector{&PtrInc, &MemI}, + /*CombinedInstrOpcode=*/*CombinedOpcode, + 
/*InsertionPoint=*/InsertionPoint, /*CombineRoot=*/&MemI, + /*MoveUpInstrsToInsertionPoint=*/std::vector{}, + /*RemoveInstrs=*/RemovePtrInc, /*Name=*/"PostInc1"); + // The offset of the PtrInc might be defined after MemI, in this case we // want to verify if it would be possible to insert the combined // instruction at the PtrInc instead of the location of MemI. Instruction @@ -515,11 +535,14 @@ MachineInstr *findPostIncMatch(MachineInstr &MemI, MachineRegisterInfo &MRI, } else if (canDelayMemOp(MemI, PtrAddInsertLoc, MRI)) { // If Definition of the offset is a G_CONSTANT we have to move that // instruction up - MatchData = { - &PtrInc, *CombinedOpcode, &PtrAddInsertLoc, - /*ExtraInstrsToMove=*/ + InsertionPoint = &PtrAddInsertLoc; + TempCombiner = AIE::Combiner( + /*CombineInstrs=*/std::vector{&PtrInc, &MemI}, + /*CombinedInstrOpcode=*/*CombinedOpcode, + /*InsertionPoint=*/InsertionPoint, /*CombineRoot=*/&MemI, + /*MoveUpInstrsToInsertionPoint=*/ findConstantOffsetsToMove(PtrInc, PtrAddInsertLoc, MRI, Helper), - /*RemoveInstr=*/true}; + /*RemoveInstrs=*/RemovePtrInc, /*Name=*/"PostInc2"); } else { LLVM_DEBUG(dbgs() << " Ignoring candidate " << PtrInc); continue; @@ -530,56 +553,141 @@ MachineInstr *findPostIncMatch(MachineInstr &MemI, MachineRegisterInfo &MRI, // TODO: This heuristic is very conservative and we should allow combines if // a combine does not dominate the insertion point but can never follow the // insertion point, e.g. being in a sibling BB. 
- bool AddrUsesDominatesInsertPoint = checkRegUsesDominate( - Addr, *MatchData.CombinedInsertPoint, PtrInc, MRI, Helper, TII); - if (EnableGreedyAddressCombine || AddrUsesDominatesInsertPoint) + bool AddrUsesDominatesInsertPoint = + checkRegUsesDominate(Addr, *InsertionPoint, PtrInc, MRI, Helper, TII); + if (EnableGreedyAddressCombine || AddrUsesDominatesInsertPoint) { + GlobalCombinerPtr->append(TempCombiner); return &PtrInc; + } } return nullptr; } +bool llvm::matchGlobalPtrModOptimizer(MachineInstr &MemI, + MachineRegisterInfo &MRI, + CombinerHelper &Helper, + const TargetInstrInfo &TII, + AIE::FoundCombiners *GlobalCombinerPtr) { + + AIE::Combiner *CombineRule = GlobalCombinerPtr->getCombine(&MemI); + if (!CombineRule) { + LLVM_DEBUG(dbgs() << "[Global Ptr Inc] Could not find Combine for " + << MemI); + return false; + } + assert(CombineRule->CombineInstrs.size() >= 2); + LLVM_DEBUG(dbgs() << "[Global Ptr Inc] Found\n" << *CombineRule); + + return true; +} + bool llvm::matchLdStInc(MachineInstr &MemI, MachineRegisterInfo &MRI, - AIELoadStoreCombineMatchData &MatchData, - CombinerHelper &Helper, const TargetInstrInfo &TII) { + CombinerHelper &Helper, const TargetInstrInfo &TII, + AIE::FoundCombiners *GlobalCombinerPtr) { const AIEBaseInstrInfo &AIETII = (const AIEBaseInstrInfo &)TII; - return findPostIncMatch(MemI, MRI, Helper, MatchData, AIETII) || - findPreIncMatch(MemI, MRI, Helper, MatchData, AIETII); + + if (GlobalCombinerPtr->hasAnalysis()) + return false; + + return findPostIncMatch(MemI, MRI, Helper, AIETII, GlobalCombinerPtr) || + findPreIncMatch(MemI, MRI, Helper, AIETII, GlobalCombinerPtr); } -void llvm::applyLdStInc(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &B, - AIELoadStoreCombineMatchData &MatchData, - GISelChangeObserver &Observer) { - if (MatchData.CombinedInsertPoint) { - B.setInstr(*MatchData.CombinedInsertPoint); +void llvm::applyLdStInc(MachineInstr &MemI, MachineRegisterInfo &MRI, + CombinerHelper &Helper, 
MachineIRBuilder &B, + GISelChangeObserver &Observer, + AIE::FoundCombiners *GlobalCombinerPtr) { + + AIE::Combiner *CombineResult = GlobalCombinerPtr->getCombine(&MemI); + assert(CombineResult); + + LLVM_DEBUG(dbgs() << "Applying Combiner "; CombineResult->dumpFull()); + + MachineInstr *CombinedInsertionPoint = CombineResult->InsertionPoint; + unsigned CombinedInstrOpcode = CombineResult->CombinedInstrOpcode; + assert(CombinedInstrOpcode != (unsigned)-1 && "Invalid OpCode"); + + if (CombinedInsertionPoint) { + B.setInstr(*CombinedInsertionPoint); } else { - B.setMBB(*MI.getParent()); + B.setMBB(*MemI.getParent()); } + + // Init combiner and get variables + MachineInstr *PtrMod = CombineResult->CombineInstrs[0]; + bool RemovePtrMod = CombineResult->RemoveInstrs.any(); + // Debug Loc: Debug Loc of LOAD STORE: MI - B.setDebugLoc(MI.getDebugLoc()); - auto NewInstr = B.buildInstr(MatchData.CombinedInstrOpcode); - for (auto *Instr : MatchData.ExtraInstrsToMove) { + B.setDebugLoc(MemI.getDebugLoc()); + auto NewInstr = B.buildInstr(CombinedInstrOpcode); + + // move Instr right before the InsertionPoint + for (auto *Instr : CombineResult->MoveUpInstrsToInsertionPoint) { + if (!Instr->getParent()) + // Instr does not exist anymore, no need to move it + continue; + + if (Helper.dominates(*Instr, *NewInstr)) + continue; + Instr->moveBefore(NewInstr); + LLVM_DEBUG(dbgs() << "Move Instr before " << *Instr); + } + + // Move Instr past the InsertionPoint + if (CombinedInsertionPoint) { + for (auto *Instr : CombineResult->DelayInstrPastInsertionPoint) { + if (!Instr->getParent()) + // Instruction may not exist anymore, i.e. 
a ptr_add that was combined + // to a post increment Instruction + continue; + + if (Helper.dominates(*CombinedInsertionPoint, *Instr)) + continue; + + LLVM_DEBUG(dbgs() << "Delaying Instr " << *Instr); + Instr->moveBefore(CombinedInsertionPoint); + } } - if (MI.mayLoad()) - NewInstr.addDef(MI.getOperand(0).getReg() /* Loaded value */); - if (MatchData.RemoveInstr) + + if (MemI.mayLoad()) + NewInstr.addDef(MemI.getOperand(0).getReg() /* Loaded value */); + if (RemovePtrMod) // If we remove the instr it is because we have defs that would otherwise // be redefined. We have to add these defs into the new instruction. - for (auto Def : MatchData.Instr->defs()) + for (auto Def : PtrMod->defs()) if (Def.isReg()) NewInstr.addDef(Def.getReg()); - if (MI.getOpcode() == TargetOpcode::G_STORE) - NewInstr.addUse(MI.getOperand(0).getReg() /* Stored value */); - for (auto Use : MatchData.Instr->uses()) + if (MemI.getOpcode() == TargetOpcode::G_STORE) + NewInstr.addUse(MemI.getOperand(0).getReg() /* Stored value */); + for (auto Use : PtrMod->uses()) if (Use.isReg()) NewInstr.addUse(Use.getReg()); - for (auto *Mem : MI.memoperands()) + for (auto *Mem : MemI.memoperands()) NewInstr.addMemOperand(Mem); - if (MatchData.RemoveInstr) - MatchData.Instr->removeFromParent(); - MI.removeFromParent(); + // keep track of Converted Instructions, so that delayInstructions are + // properly keep track of + GlobalCombinerPtr->createMapping(&MemI, NewInstr); + + LLVM_DEBUG(dbgs() << *NewInstr.getInstr()); + + for (int Idx = CombineResult->RemoveInstrs.find_first(); Idx != -1; + Idx = CombineResult->RemoveInstrs.find_next(Idx)) { + auto *RemoveMI = CombineResult->CombineInstrs[Idx]; + + // Removed Instructions have to be remapped to the newly Inserted + // Instructions, so that they are considered when the Removed Instruction + // should be moved up/down + GlobalCombinerPtr->createMapping(RemoveMI, NewInstr); + + LLVM_DEBUG(dbgs() << " Removing " << *RemoveMI); + assert(RemoveMI->getParent() && + 
"RemoveMI was already deleted. This Combiner may have a conflict " + "with the Combiner that already removed the MachineInstr."); + RemoveMI->removeFromParent(); + } + MemI.removeFromParent(); } // Match all equivalents of these: @@ -3298,3 +3406,7 @@ bool llvm::matchBroadcastToShl(MachineInstr &MI, MachineRegisterInfo &MRI, return true; } + +void llvm::foundPattern(MachineInstr &MemI) { + dbgs() << "Found Custom Pattern " << MemI; +} diff --git a/llvm/lib/Target/AIE/AIECombinerHelper.h b/llvm/lib/Target/AIE/AIECombinerHelper.h index 2280d27de4d5..d57ab96a731b 100644 --- a/llvm/lib/Target/AIE/AIECombinerHelper.h +++ b/llvm/lib/Target/AIE/AIECombinerHelper.h @@ -11,28 +11,14 @@ #ifndef LLVM_LIB_TARGET_AIE_AIECOMBINERHELPER_H #define LLVM_LIB_TARGET_AIE_AIECOMBINERHELPER_H +#include "AIEPtrModOptimizer.h" #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" -#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineInstr.h" namespace llvm { struct AIEBaseInstrInfo; -struct AIELoadStoreCombineMatchData { - /// Matched PtrAdd instruction - MachineInstr *Instr; - /// Opcode of the combined instruction - unsigned CombinedInstrOpcode; - /// If null insert instruction at the end of the BB, otherwise insert just - /// before this Instruction - MachineInstr *CombinedInsertPoint; - /// Additional instructions to be moved just before Instr - std::vector ExtraInstrsToMove; - /// Should Instr (the PtrAdd) be removed after the combine was applied - bool RemoveInstr; -}; - struct ShuffleMaskValidity { bool IsValid; // Holds mask indices that don't satisfy the mask constraints @@ -89,16 +75,24 @@ struct AIESingleDiffLaneBuildVectorMatchData { unsigned DifferingIndex; }; +void foundPattern(MachineInstr &MemI); + +bool matchGlobalPtrModOptimizer(MachineInstr &MemI, MachineRegisterInfo &MRI, + CombinerHelper &Helper, + const TargetInstrInfo &TII, + AIE::FoundCombiners *GlobalCombinerPtr); + /// Look for any PtrAdd 
instruction that use the same base as \a MI that can be -/// combined with it and stores it in \a MatchData +/// combined with it and stores it in \a GlobalCombinerPtr /// \return true if an instruction is found bool matchLdStInc(MachineInstr &MI, MachineRegisterInfo &MRI, - AIELoadStoreCombineMatchData &MatchData, - CombinerHelper &Helper, const TargetInstrInfo &TII); -/// Combines \a MI and the instruction stored in \a MatchData + CombinerHelper &Helper, const TargetInstrInfo &TII, + AIE::FoundCombiners *GlobalCombinerPtr); +/// Combines \a MI and the instruction stored in \a GlobalCombinerPtr void applyLdStInc(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &B, AIELoadStoreCombineMatchData &MatchData, - GISelChangeObserver &Observer); + CombinerHelper &Helper, MachineIRBuilder &B, + GISelChangeObserver &Observer, + AIE::FoundCombiners *GlobalCombinerPtr); /// Look for with G_IMPLICIT_DEF source operands /// \return true if such an instruction is found bool matchAddVecEltUndef(MachineInstr &MI, MachineRegisterInfo &MRI, @@ -131,7 +125,7 @@ bool matchShuffleToExtractBroadcast(MachineInstr &MI, MachineRegisterInfo &MRI, /// \return true if \a MemI can be moved just before \a Dest in order to allow /// post-increment combining bool canDelayMemOp(MachineInstr &MemI, MachineInstr &Dest, - MachineRegisterInfo &MRI); + const MachineRegisterInfo &MRI); /// \return true if \a Dest can be moved just after \a MemI in order to allow /// combining bool canAdvanceOp(MachineInstr &MemI, MachineInstr &Dest, diff --git a/llvm/lib/Target/AIE/AIEGlobalCombiner.cpp b/llvm/lib/Target/AIE/AIEGlobalCombiner.cpp new file mode 100644 index 000000000000..5043d02df1b1 --- /dev/null +++ b/llvm/lib/Target/AIE/AIEGlobalCombiner.cpp @@ -0,0 +1,147 @@ +//===--- AIEGlobalCombiner.cpp - Global Combiner Helper Interface ---------===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates +// +//===----------------------------------------------------------------------===// +// +// Implements the generic algorithmic parts of the global combiner search. +// +//===----------------------------------------------------------------------===// + +#include "AIEGlobalCombiner.h" + +namespace llvm::AIE { + +// -------------------------- CombinerGain -----------------------------------// + +CombinerGain::CombinerGain(std::initializer_list InitialGain) { + assert(InitialGain.size() <= GainVector.size()); + reset(); + + std::copy(InitialGain.begin(), InitialGain.end(), GainVector.begin()); +} + +void CombinerGain::reset() { + std::fill(GainVector.begin(), GainVector.end(), 0); +} + +bool CombinerGain::operator>(const CombinerGain &Rhs) const { + for (unsigned Idx = 0; Idx < GainVector.size(); Idx++) { + if (GainVector[Idx] == Rhs.GainVector[Idx]) + continue; + + return GainVector[Idx] > Rhs.GainVector[Idx]; + } + return false; +} + +bool CombinerGain::operator<(const CombinerGain &Rhs) const { + assert(Rhs.GainVector.size() == GainVector.size()); + return GainVector < Rhs.GainVector; +} + +bool CombinerGain::operator==(const CombinerGain &Rhs) const { + assert(Rhs.GainVector.size() == GainVector.size()); + return GainVector == Rhs.GainVector; +} + +bool CombinerGain::operator!=(const CombinerGain &Rhs) const { + assert(Rhs.GainVector.size() == GainVector.size()); + return !(*this == Rhs); +} + +CombinerGain &CombinerGain::operator+=(const CombinerGain &Rhs) { + for (unsigned Idx = 0; Idx < GainVector.size(); Idx++) { + GainVector[Idx] += Rhs.GainVector[Idx]; + } + return *this; +} + +CombinerGain CombinerGain::operator+(const CombinerGain &Rhs) const { + CombinerGain Result(*this); + return Result += Rhs; +} + +CombinerGain &CombinerGain::operator-=(const CombinerGain &Rhs) { + for (unsigned Idx = 0; Idx < GainVector.size(); Idx++) 
{ + GainVector[Idx] -= Rhs.GainVector[Idx]; + } + return *this; +} + +CombinerGain CombinerGain::operator-(const CombinerGain &Rhs) const { + CombinerGain Result(*this); + return Result -= Rhs; +} + +// --------------------------- Combiner --------------------------------------// + +Combiner::Combiner(std::vector CombineInstrs, + unsigned CombinedInstrOpcode, MachineInstr *InsertionPoint, + MachineInstr *CombineRoot, + std::vector MoveUpInstrsToCombineRoot, + BitVector RemoveInstrs, StringRef Name) + : CombineRoot(CombineRoot), InsertionPoint(InsertionPoint), + CombineInstrs(CombineInstrs), + MoveUpInstrsToInsertionPoint(MoveUpInstrsToCombineRoot), + CombinedInstrOpcode(CombinedInstrOpcode), RemoveInstrs(RemoveInstrs), + Name(Name) {} + +void Combiner::dumpFull() const { dumpFull(nullptr, nullptr); } + +void Combiner::dumpFull(unsigned *GlobalID, CombinerGain *Gain) const { + if (!GlobalID || !Gain) + dbgs() << "{Combiner " << Name << "\n"; + else + dbgs() << "{Combiner " << Name << " [" << *GlobalID << "]; Gain = " << *Gain + << "\n"; + + for (auto *MI : CombineInstrs) { + dbgs() << " " << *MI; + } + dbgs() << " Insertion Point: "; + dbgs() << " " << *InsertionPoint; + + if (!MoveUpInstrsToInsertionPoint.empty()) { + dbgs() << " to Move Up: \n"; + for (const auto *MI : MoveUpInstrsToInsertionPoint) + dbgs() << " " << *MI; + } + if (!DelayInstrToInsertionPoint.empty()) { + dbgs() << " Move to Insertion: \n"; + for (const auto *MI : DelayInstrToInsertionPoint) + dbgs() << " " << *MI; + } + if (!DelayInstrPastInsertionPoint.empty()) { + dbgs() << " Delay past Insertion: \n"; + for (const auto *MI : DelayInstrPastInsertionPoint) + dbgs() << " " << *MI; + } + dbgs() << "}\n"; +} + +// ---------------------------------------------------------------------------// + +raw_ostream &operator<<(raw_ostream &OS, const CombinerGain &Val) { + OS << "["; + for (auto &Gain : Val.GainVector) { + OS << Gain << ","; + } + OS << "]"; + return OS; +} + +raw_ostream 
&operator<<(raw_ostream &OS, const Combiner &Val) { + OS << Val.Name << " ("; + for (auto *MI : Val.CombineInstrs) { + OS << MI->getOperand(0) << ","; + } + OS << ") "; + return OS; +} + +}; // namespace llvm::AIE diff --git a/llvm/lib/Target/AIE/AIEGlobalCombiner.h b/llvm/lib/Target/AIE/AIEGlobalCombiner.h new file mode 100644 index 000000000000..5453c9313a21 --- /dev/null +++ b/llvm/lib/Target/AIE/AIEGlobalCombiner.h @@ -0,0 +1,111 @@ +//===--- AIEGlobalCombiner.h - Global Combiner Helper Interface -----------===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates +// +//===----------------------------------------------------------------------===// +// This is a generic global (Machine Basic Block-level) Combiner that searches +// through multiple combiners to provide a set of combiners that maximize the +// gain of the individual combiners. +// +// The interface is AIEGlobalCombiner, which consists of a setup-function to +// setup the Machine Basic Block, a generateCombiners-function that generates +// all possible combiners and the findBeneficialCombiners function that searches +// for a profitable set of combiners. +// +// A Combiner (CombineResult) consists of a Root, which is a MachineInstruction +// that is replaced by a new Instruction of the Combiner. Additionally, multiple +// Combiners can be clustered together by the Registers that are common between +// Combiners. Combiners with common clustered Registers may conflict with each +// other, meaning that they cannot be applied together. +// +// The search for a set of beneficial combiners is divided into two parts, a +// greedy heuristic and the search. The search is inspired by A*. The goal of +// the search is to maximize the gain of the Combiners. 
+// + +#ifndef LLVM_LIB_TARGET_AIE_AIEGLOBALCOMBINER_H +#define LLVM_LIB_TARGET_AIE_AIEGLOBALCOMBINER_H + +#include "AIEBaseInstrInfo.h" +#include "AIEInterBlockScheduling.h" + +namespace llvm::AIE { + +class CombinerGain { +public: + // Hierarchical gain vector. Larger Indices are less important in the Gain + // calculation + std::array GainVector; + + CombinerGain() : CombinerGain({}) {} + CombinerGain(std::initializer_list InitialGain); + CombinerGain(const CombinerGain &Other) = default; + + virtual ~CombinerGain() = default; + + /// reset the GainVector + void reset(); + + bool operator>(const CombinerGain &Rhs) const; + bool operator<=(const CombinerGain &Rhs) const; + bool operator<(const CombinerGain &Rhs) const; + bool operator==(const CombinerGain &Rhs) const; + bool operator!=(const CombinerGain &Rhs) const; + + CombinerGain &operator+=(const CombinerGain &Rhs); + CombinerGain operator+(const CombinerGain &Rhs) const; + CombinerGain &operator-=(const CombinerGain &Rhs); + CombinerGain operator-(const CombinerGain &Rhs) const; +}; + +raw_ostream &operator<<(raw_ostream &OS, const CombinerGain &Val); + +class Combiner { +public: + /// Root of the Combiner + MachineInstr *CombineRoot = nullptr; + /// Keep track of the MachineInstrs used by the Combiner + /// If null insert instructions at the end of + /// the BB, otherwise insert just before this + /// Instruction + MachineInstr *InsertionPoint = nullptr; + std::vector CombineInstrs; + /// Instructions to be moved up just before InsertionPoint + std::vector MoveUpInstrsToInsertionPoint; + /// Instructions to move down to InsertionPoint + std::vector DelayInstrToInsertionPoint; + /// Instruction to move below InsertionPoint + std::vector DelayInstrPastInsertionPoint; + /// Opcode of the combined instruction + unsigned CombinedInstrOpcode = -1; + /// Subset of CombineInstructions to be removed + BitVector RemoveInstrs; + StringRef Name; + + Combiner() = default; + Combiner(std::vector CombineInstrs, + 
unsigned CombinedInstrOpcode, MachineInstr *InsertionPoint, + MachineInstr *CombineRoot, + std::vector MoveUpInstrsToInsertionPoint, + BitVector RemoveInstrs, StringRef Name); + + /// Dump all the relevant information of the Combiner. + void dumpFull() const; + void dumpFull(unsigned *GlobalID, CombinerGain *Gain) const; +}; + +raw_ostream &operator<<(raw_ostream &OS, const Combiner &Val); + +class GenericCombiner { + +public: + Combiner CombinerData; +}; + +} // namespace llvm::AIE + +#endif // LLVM_LIB_TARGET_AIE_AIEGLOBALCOMBINER_H diff --git a/llvm/lib/Target/AIE/AIEPtrModOptimizer.cpp b/llvm/lib/Target/AIE/AIEPtrModOptimizer.cpp new file mode 100644 index 000000000000..dbd245a216f9 --- /dev/null +++ b/llvm/lib/Target/AIE/AIEPtrModOptimizer.cpp @@ -0,0 +1,160 @@ +//===--- AIEPtrModOptimizer.cpp - Optimizer ptr mod operations ------------===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates +// +//===----------------------------------------------------------------------===// +// +// Implements the llvm-pass AIEPtrModOptimizer to query Combiners +// (FoundCombiners). 
+// +//===----------------------------------------------------------------------===// + +#include "AIEPtrModOptimizer.h" +#include "AIE.h" +#include "AIEBaseInstrInfo.h" +#include "AIEGlobalCombiner.h" +#include "AIEInterBlockScheduling.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/CodeGen/GlobalISel/CSEInfo.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/CodeGen/ScheduleDAGInstrs.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/InitializePasses.h" +#include + +#define DEBUG_TYPE "aie-ptr-mod-opt" + +extern cl::opt EnableGlobalPtrModOptimizer; + +using namespace llvm; + +static const char AIE_PTR_MOD_OPTIMIZER[] = "AIE Pointer Modifier Optimization"; + +StringRef AIEPtrModOptimizer::getPassName() const { + return AIE_PTR_MOD_OPTIMIZER; +} +namespace llvm { + +bool AIEPtrModOptimizer::runOnMachineFunction(MachineFunction &MF) { + PtrModRes = std::make_unique( + /*Analysis=*/false); + + return false; +} + +void AIEPtrModOptimizer::appendResult( + std::vector &Combiners) { + + for (auto *Combine : Combiners) { + PtrModRes->append(Combine->CombinerData); + } +} + +void AIEPtrModOptimizer::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.setPreservesAll(); +} + +} // namespace llvm + +namespace llvm::AIE { +void FoundCombiners::append(const AIE::Combiner &CombineResult) { + assert(CombineResult.CombineRoot); + InstrCombines[CombineResult.CombineRoot] = CombineResult; + LLVM_DEBUG(dbgs() << "[Solution] "; CombineResult.dumpFull()); +} + +AIE::Combiner *FoundCombiners::getCombine(MachineInstr *CombineRoot) { + auto It = find_if( + InstrCombines, + [&](const std::pair &CandidatePair) { + const MachineInstr *CandidateI = 
CandidatePair.first; + return CandidateI == CombineRoot; + }); + + if (It == InstrCombines.end()) + return nullptr; + + // remap instructions + AIE::Combiner &Combiner = It->second; + remapCombiner(Combiner); + return &Combiner; +} + +void FoundCombiners::createMapping(MachineInstr *ReplacedMI, + MachineInstr *NewlyInsertedMI) { + assert(NewlyInsertedMI != nullptr); + assert(ReplacedMI != nullptr); + ConvertedInstrs[ReplacedMI] = NewlyInsertedMI; +} + +MachineInstr *FoundCombiners::getMappedInstr(MachineInstr *ReplacedMI) const { + auto It = ConvertedInstrs.find(ReplacedMI); + if (It == ConvertedInstrs.end()) + return nullptr; + + return It->second; +} + +std::vector +FoundCombiners::getRemappedInstrs(std::vector &MIs) const { + std::vector FinalInstrs; + for (auto *MI : MIs) { + if (auto *Converted = getMappedInstr(MI)) + FinalInstrs.push_back(Converted); + else + FinalInstrs.push_back(MI); + } + return FinalInstrs; +} + +void FoundCombiners::remapCombiner(AIE::Combiner &Combiner) const { + Combiner.MoveUpInstrsToInsertionPoint = + getRemappedInstrs(Combiner.MoveUpInstrsToInsertionPoint); + Combiner.DelayInstrToInsertionPoint = + getRemappedInstrs(Combiner.DelayInstrToInsertionPoint); + Combiner.DelayInstrPastInsertionPoint = + getRemappedInstrs(Combiner.DelayInstrPastInsertionPoint); +} + +const std::map & +FoundCombiners::getInstrCombines() const { + return InstrCombines; +} + +} // namespace llvm::AIE + +raw_ostream &llvm::operator<<(raw_ostream &OS, const AIE::FoundCombiners &Val) { + OS << "{ "; + for (auto &MemPtr : Val.getInstrCombines()) { + OS << *MemPtr.first; + } + OS << " }"; + return OS; +} + +char AIEPtrModOptimizer::ID = 0; +INITIALIZE_PASS_BEGIN(AIEPtrModOptimizer, DEBUG_TYPE, AIE_PTR_MOD_OPTIMIZER, + false, false) +INITIALIZE_PASS_DEPENDENCY(GISelCSEAnalysisWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MachineModuleInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_END(AIEPtrModOptimizer, DEBUG_TYPE, AIE_PTR_MOD_OPTIMIZER, 
+ false, false) + +MachineFunctionPass *llvm::createAIEPtrModOptimizer() { + return new AIEPtrModOptimizer(); +} diff --git a/llvm/lib/Target/AIE/AIEPtrModOptimizer.h b/llvm/lib/Target/AIE/AIEPtrModOptimizer.h new file mode 100644 index 000000000000..3dae93056cf6 --- /dev/null +++ b/llvm/lib/Target/AIE/AIEPtrModOptimizer.h @@ -0,0 +1,100 @@ +//===--- AIEPtrModOptimizer.h - Optimizer ptr mod operations +//-------------------===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates +// +//===----------------------------------------------------------------------===// +// +// Defines a llvm-pass (AIEPtrModOptimizer) to query Combiners +// (FoundCombiners). +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AIE_AIEPTRMODOPTIMIZERPASS_H +#define LLVM_LIB_TARGET_AIE_AIEPTRMODOPTIMIZERPASS_H + +#include "AIE.h" +#include "AIEGlobalCombiner.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/InitializePasses.h" + +using namespace llvm; + +namespace llvm::AIE { + +class FoundCombiners { + // Whether FoundCombiners has been generated by an Analysis Pass + bool GeneratedFromAnalysisPass = false; + /// Map between CombineRoot and the corresponding Combiner + std::map InstrCombines; + + /// Keep track for which MBB Combiners were found + std::set MBBWithSolution; + + /// The keys are to be replaced MachineInstructions, and the values the newly + /// inserted Instructions. 
+ std::map ConvertedInstrs; + + /// If a MachineInstr is remapped to a new Instruction through a previous + /// Combiner, update the MachineInstr and \return valid MachineInstructions + std::vector + getRemappedInstrs(std::vector &MIs) const; + + /// remap all the instruction of \p Combiner to valid instructions + void remapCombiner(Combiner &Combiner) const; + +public: + FoundCombiners() {} + FoundCombiners(bool Analysis) : GeneratedFromAnalysisPass(Analysis) {} + FoundCombiners(FoundCombiners &Arg) = delete; + ~FoundCombiners() = default; + + /// Add \p Combiner + void append(const Combiner &CombineResult); + + /// \return Combiner to the \p CombineRoot + Combiner *getCombine(MachineInstr *CombineRoot); + + /// Create Mapping between \p ReplacedMI and \p NewlyInsertedMI + void createMapping(MachineInstr *ReplacedMI, MachineInstr *NewlyInsertedMI); + + /// \return newly Inserted Instruction of \p ReplaceMI + MachineInstr *getMappedInstr(MachineInstr *ReplacedMI) const; + + const std::map &getInstrCombines() const; + + /// \return whether Analysis Pass generated this Object + bool hasAnalysis() const { return GeneratedFromAnalysisPass; } +}; +} // namespace llvm::AIE + +namespace llvm { +raw_ostream &operator<<(raw_ostream &OS, const AIE::FoundCombiners &Val); + +class AIEPtrModOptimizer : public llvm::MachineFunctionPass { + std::unique_ptr PtrModRes; + +public: + static char ID; + AIEPtrModOptimizer() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override; + + StringRef getPassName() const override; + + AIE::FoundCombiners *getGlobalPtrCombiners() { return PtrModRes.get(); } + +private: + void appendResult(std::vector &Combiners); +}; +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AIE_AIEPTRMODOPTIMIZERPASS_H diff --git a/llvm/lib/Target/AIE/CMakeLists.txt b/llvm/lib/Target/AIE/CMakeLists.txt index 1748a9e0d63d..f69d955fd6a9 100644 --- 
a/llvm/lib/Target/AIE/CMakeLists.txt +++ b/llvm/lib/Target/AIE/CMakeLists.txt @@ -98,6 +98,7 @@ add_llvm_target(AIECodeGen AIEEliminateDuplicatePHI.cpp AIEFinalizeBundle.cpp AIEFrameLowering.cpp + AIEGlobalCombiner.cpp AIEHazardRecognizer.cpp AIEHazardRecognizerPRAS.cpp AIEInstrInfo.cpp @@ -118,6 +119,7 @@ add_llvm_target(AIECodeGen AIEPostPipeliner.cpp AIEPostSelectOptimize.cpp AIEPseudoBranchExpansion.cpp + AIEPtrModOptimizer.cpp AIERegClassConstrainer.cpp AIERegMemEventTracker.cpp AIERegisterInfo.cpp diff --git a/llvm/lib/Target/AIE/aie2p/AIE2PPostLegalizerCustomCombiner.cpp b/llvm/lib/Target/AIE/aie2p/AIE2PPostLegalizerCustomCombiner.cpp index 5396024808aa..85412f68db7d 100644 --- a/llvm/lib/Target/AIE/aie2p/AIE2PPostLegalizerCustomCombiner.cpp +++ b/llvm/lib/Target/AIE/aie2p/AIE2PPostLegalizerCustomCombiner.cpp @@ -45,6 +45,8 @@ class AIE2PPostLegalizerCustomCombinerImpl : public Combiner { protected: // TODO: Make CombinerHelper methods const. mutable CombinerHelper Helper; + AIE::FoundCombiners EmptyGlobalCombiner; + AIE::FoundCombiners *GlobalCombiners = nullptr; const AIE2PPostLegalizerCustomCombinerImplRuleConfig &RuleConfig; const AIE2PSubtarget &STI; @@ -52,6 +54,7 @@ class AIE2PPostLegalizerCustomCombinerImpl : public Combiner { AIE2PPostLegalizerCustomCombinerImpl( MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC, GISelKnownBits &KB, GISelCSEInfo *CSEInfo, + AIE::FoundCombiners *GlobalCombiner, const AIE2PPostLegalizerCustomCombinerImplRuleConfig &RuleConfig, const AIE2PSubtarget &STI, MachineDominatorTree *MDT, const LegalizerInfo *LI); @@ -73,16 +76,19 @@ class AIE2PPostLegalizerCustomCombinerImpl : public Combiner { AIE2PPostLegalizerCustomCombinerImpl::AIE2PPostLegalizerCustomCombinerImpl( MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC, GISelKnownBits &KB, GISelCSEInfo *CSEInfo, + AIE::FoundCombiners *GlobalCombiner, const AIE2PPostLegalizerCustomCombinerImplRuleConfig &RuleConfig, const AIE2PSubtarget 
&STI, MachineDominatorTree *MDT, const LegalizerInfo *LI) : Combiner(MF, CInfo, TPC, &KB, CSEInfo), Helper(Observer, B, /*IsPostLegalize*/ false, &KB, MDT, LI), - RuleConfig(RuleConfig), STI(STI), + GlobalCombiners(GlobalCombiner), RuleConfig(RuleConfig), STI(STI), #define GET_GICOMBINER_CONSTRUCTOR_INITS #include "AIE2PGenPostLegalizerGICustomCombiner.inc" #undef GET_GICOMBINER_CONSTRUCTOR_INITS { + if (!GlobalCombiner) + GlobalCombiners = &EmptyGlobalCombiner; } class AIE2PPostLegalizerCustomCombiner : public MachineFunctionPass { @@ -144,11 +150,16 @@ bool AIE2PPostLegalizerCustomCombiner::runOnMachineFunction( GISelKnownBits *KB = &getAnalysis().get(MF); MachineDominatorTree *MDT = &getAnalysis(); + AIE::FoundCombiners *AIEGlobalPtrIncResults = nullptr; + if (auto *PtrModOptPass = getAnalysisIfAvailable()) + AIEGlobalPtrIncResults = PtrModOptPass->getGlobalPtrCombiners(); + CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, /*LegalizerInfo*/ nullptr, EnableOpt, F.hasOptSize(), F.hasMinSize()); AIE2PPostLegalizerCustomCombinerImpl Impl(MF, CInfo, TPC, *KB, CSEInfo, - RuleConfig, ST, MDT, LI); + AIEGlobalPtrIncResults, RuleConfig, + ST, MDT, LI); return Impl.combineMachineInstrs(); } diff --git a/llvm/lib/Target/AIE/aie2p/AIE2PTargetMachine.cpp b/llvm/lib/Target/AIE/aie2p/AIE2PTargetMachine.cpp index ab0158f21334..6f9f976e4950 100644 --- a/llvm/lib/Target/AIE/aie2p/AIE2PTargetMachine.cpp +++ b/llvm/lib/Target/AIE/aie2p/AIE2PTargetMachine.cpp @@ -22,6 +22,7 @@ extern cl::opt EnableSuperRegSplitting; extern cl::opt AllocateMRegsFirst; extern cl::opt EnablePreMISchedCoalescer; extern cl::opt EnableAddressChaining; +extern cl::opt EnableGlobalPtrModOptimizer; extern cl::opt EnableWAWRegRewrite; void AIE2PTargetMachine::anchor() {} @@ -62,6 +63,8 @@ void AIE2PPassConfig::addPreRegBankSelect() { addPass(createAIE2PPostLegalizerGenericCombiner()); if (EnableAddressChaining) addPass(createAIEClusterBaseAddress()); + if 
(EnableGlobalPtrModOptimizer) + addPass(createAIEPtrModOptimizer()); addPass(createAIE2PPostLegalizerCustomCombiner()); } } From 59af7562b360fae87459e5ed74c3f61372635d0d Mon Sep 17 00:00:00 2001 From: Fabian Stuckmann Date: Wed, 21 May 2025 03:14:12 -0600 Subject: [PATCH 4/6] [AIEX] added greedy global combiner search --- .../AIE/AIE2PostLegalizerCustomCombiner.cpp | 6 +- llvm/lib/Target/AIE/AIEBaseTargetMachine.cpp | 2 +- llvm/lib/Target/AIE/AIECombine.td | 23 +- llvm/lib/Target/AIE/AIEGlobalCombiner.cpp | 630 +++++++++++++- llvm/lib/Target/AIE/AIEGlobalCombiner.h | 309 +++++++ .../Target/AIE/AIEGlobalCombinerPtrMods.cpp | 458 ++++++++++ .../lib/Target/AIE/AIEGlobalCombinerPtrMods.h | 183 ++++ llvm/lib/Target/AIE/AIEInterBlockScheduling.h | 6 +- llvm/lib/Target/AIE/AIEPtrModOptimizer.cpp | 41 +- llvm/lib/Target/AIE/AIETargetMachine.cpp | 1 + llvm/lib/Target/AIE/CMakeLists.txt | 1 + .../AIE2PPostLegalizerCustomCombiner.cpp | 5 + .../GlobalISel/addrspace-before-selection.ll | 16 +- .../AIE/GlobalISel/combine-loads-stores.mir | 164 ++-- .../AIE/GlobalISel/combine-split-large-ls.mir | 26 +- .../GlobalISel/indexed-512-load-store.mir | 46 +- .../CodeGen/AIE/aie2/conv2d_offset_test.ll | 34 +- .../CodeGen/AIE/aie2/end-to-end/Add2D-red.ll | 115 +-- .../CodeGen/AIE/aie2/end-to-end/Memops.ll | 39 +- .../CodeGen/AIE/aie2/llc-pipeline-aie2.ll | 1 + llvm/test/CodeGen/AIE/aie2/mmo-load.ll | 4 +- llvm/test/CodeGen/AIE/aie2/movxm_test.ll | 15 +- .../global-combiners/MBB-crossing.mir | 13 +- .../GlobalIsel/global-combiners/gemm.mir | 12 +- .../global-combiners/greedy-log.mir | 58 ++ .../mixture-offset-postinc-selection.mir | 1 + .../global-combiners/movability-check.mir | 71 +- .../global-combiners/overlap-gain.mir | 3 +- .../global-combiners/post-inc-eagerness.mir | 15 +- .../global-combiners/reorder-Mem-Instrs.mir | 10 +- .../shared-postinc-constants.mir | 1 + .../global-combiners/user-intrinsics.mir | 13 +- llvm/test/CodeGen/AIE/aie2p/Memops.ll | 45 +- 
.../AIE/aie2p/combine-loads-stores.mir | 191 +++-- .../aie2p/end-to-end/conv2d_bfp16_convert.ll | 14 +- llvm/test/CodeGen/AIE/aie2p/fifo-loads.ll | 339 ++++---- .../CodeGen/AIE/aie2p/ldst-fifo-stores.ll | 220 ++--- .../CodeGen/AIE/aie2p/llc-pipeline-aie2p.ll | 2 + .../CodeGen/AIE/aie2p/load-store-unaligned.ll | 780 ++++++++++-------- 39 files changed, 2876 insertions(+), 1037 deletions(-) create mode 100644 llvm/lib/Target/AIE/AIEGlobalCombinerPtrMods.cpp create mode 100644 llvm/lib/Target/AIE/AIEGlobalCombinerPtrMods.h create mode 100644 llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/greedy-log.mir diff --git a/llvm/lib/Target/AIE/AIE2PostLegalizerCustomCombiner.cpp b/llvm/lib/Target/AIE/AIE2PostLegalizerCustomCombiner.cpp index e8666b41cd2c..918361fa8edd 100644 --- a/llvm/lib/Target/AIE/AIE2PostLegalizerCustomCombiner.cpp +++ b/llvm/lib/Target/AIE/AIE2PostLegalizerCustomCombiner.cpp @@ -33,6 +33,8 @@ using namespace llvm; +extern cl::opt EnableGlobalPtrModOptimizer; + static const char AIE2_POSTLEGALIZER_CUSTOM_COMBINER[] = "AIE2 Post Legalizer Custom Combiner"; @@ -113,7 +115,9 @@ class AIE2PostLegalizerCustomCombiner : public MachineFunctionPass { AU.addPreserved(); AU.addRequired(); AU.addPreserved(); - AU.addPreserved(); + if (EnableGlobalPtrModOptimizer) { + AU.addRequired(); + } MachineFunctionPass::getAnalysisUsage(AU); } diff --git a/llvm/lib/Target/AIE/AIEBaseTargetMachine.cpp b/llvm/lib/Target/AIE/AIEBaseTargetMachine.cpp index 1fddc94a4bf0..bfab28cd155e 100644 --- a/llvm/lib/Target/AIE/AIEBaseTargetMachine.cpp +++ b/llvm/lib/Target/AIE/AIEBaseTargetMachine.cpp @@ -74,7 +74,7 @@ cl::opt EnableAddressChaining("aie-address-chaining", cl::Hidden, cl::desc("Enable ptradd chaining.")); cl::opt EnableGlobalPtrModOptimizer( - "aie-global-ptr-mod-opt", cl::Hidden, cl::init(false), + "aie-global-ptr-mod-opt", cl::Hidden, cl::init(true), cl::desc("Enable global pointer modifier optimization.")); cl::opt diff --git a/llvm/lib/Target/AIE/AIECombine.td 
b/llvm/lib/Target/AIE/AIECombine.td index cc672a53a23b..69d8c9c2079e 100644 --- a/llvm/lib/Target/AIE/AIECombine.td +++ b/llvm/lib/Target/AIE/AIECombine.td @@ -275,8 +275,18 @@ def combine_offset_load_store_share_ptradd : GICombineRule< (apply [{ applyOffsetLoadStoreSharePtrAdd(*${root}, MRI, B, ${matchinfo}); }]) >; + +def combine_global_load_store_increment : GICombineRule < + (defs root:$root), + (match (wip_match_opcode G_LOAD, G_ZEXTLOAD, G_SEXTLOAD, G_STORE):$root, + [{ return matchGlobalPtrModOptimizer(*${root}, MRI, Helper, B.getTII(), GlobalCombiners); }]), + (apply [{ applyLdStInc(*${root}, MRI, Helper, B, Observer, GlobalCombiners); }] )>; + + + def AIE2PostLegalizerCustomCombiner - : GICombiner<"AIE2PostLegalizerCustomCombinerImpl", [ combine_load_store_split, + : GICombiner<"AIE2PostLegalizerCustomCombinerImpl", [ combine_global_load_store_increment, + combine_load_store_split, ptr_add_immed_chain, combine_load_store_increment, combine_offset_load_store_ptradd, @@ -289,9 +299,10 @@ def AIE2PostLegalizerCustomCombiner } def AIE2PPostLegalizerCustomCombiner - : GICombiner<"AIE2PPostLegalizerCustomCombinerImpl", [ combine_load_store_increment, - ptr_add_immed_chain, - combine_offset_load_store_ptradd, - combine_offset_load_store_share_ptradd, - combine_add_vector_elt_undef ]> { + : GICombiner<"AIE2PPostLegalizerCustomCombinerImpl", [ combine_global_load_store_increment, + combine_load_store_increment, + ptr_add_immed_chain, + combine_offset_load_store_ptradd, + combine_offset_load_store_share_ptradd, + combine_add_vector_elt_undef ]> { } diff --git a/llvm/lib/Target/AIE/AIEGlobalCombiner.cpp b/llvm/lib/Target/AIE/AIEGlobalCombiner.cpp index 5043d02df1b1..59448d9f5862 100644 --- a/llvm/lib/Target/AIE/AIEGlobalCombiner.cpp +++ b/llvm/lib/Target/AIE/AIEGlobalCombiner.cpp @@ -13,8 +13,543 @@ //===----------------------------------------------------------------------===// #include "AIEGlobalCombiner.h" +#include "AIECombinerHelper.h" +#include 
"llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/IR/IntrinsicsAIE2P.h" +#include "llvm/Support/Debug.h" +#include +#include namespace llvm::AIE { +using std::distance; + +#define DEBUG_TYPE "global-combiner" + +static cl::opt MaxSearchIterationCount("global-combiner-max-search-iter", + cl::Hidden, cl::init(100000), + cl::desc("Maximum Search Tries.")); + +std::vector +AIEGlobalCombiner::getCombiners(MachineBasicBlock &MBB) { + generateCombiners(MBB); + + std::vector Combiners = findBeneficialCombiners(); + + FixedCombiners.insert(FixedCombiners.end(), Combiners.begin(), + Combiners.end()); + return Combiners; +} + +void AIEGlobalCombiner::generateCombiners(MachineBasicBlock &MBB) { + clear(); + + GlobalCombiner::initDAG(DAG, MBB); + + for (auto &SUnit : DAG.SUnits) { + + for (auto *Combiner : CombinerOptions) { + if (!Combiner->isCombineRootCandidate(SUnit.getInstr())) + continue; + + calculateCombineCandidates(SUnit, Combiner); + } + } + + calculateCombinerConflicts(); +} + +std::vector +AIEGlobalCombiner::findBeneficialCombiners() { + std::vector FoundCombiners; + + for (auto CombineCandidates : getCombineCandidates(ClusteredCombiners)) { + LLVM_DEBUG(dbgs() << "Next Cluster Start\n"); + + CombineCandidates.filterOut(FixedCombiners); + CombineCandidates.filterOut(FoundCombiners); + + for (auto *Combiner : + CombineCandidates.searchCombinerSet(OwnedCombineCandidates)) + FoundCombiners.push_back(Combiner); + } + + LLVM_DEBUG(dbgs() << "[Global Combiner] Found " << FoundCombiners.size() + << " Fixed Combiners\n\n"); + + return FoundCombiners; +} + +void AIEGlobalCombiner::calculateCombineCandidates( + SUnit &CombineRoot, const GenericCombiner *Combiner) { + assert(MDT); + + LLVM_DEBUG(dbgs() << " [" << Combiner->getName() << "] " + << *CombineRoot.getInstr()); + + for (std::unique_ptr &Combiner : + Combiner->applyCombiner(CombineRoot, *MDT, DAG, *MRI, *TII)) { + Combiner->setGlobalID(OwnedCombineCandidates.size()); + 
OwnedCombineCandidates.push_back(std::move(Combiner)); + + GenericCombiner *CurrentCombiner = OwnedCombineCandidates.back().get(); + const auto BaseRegister = getClusterBaseRegister(*CurrentCombiner); + ClusteredCombiners[BaseRegister].append(CurrentCombiner); + } + LLVM_DEBUG(dbgs() << "\n"); +} + +void AIEGlobalCombiner::calculateCombinerConflicts() { + for (unsigned Idx = 0; Idx < OwnedCombineCandidates.size(); Idx++) { + assert(Idx == OwnedCombineCandidates[Idx].get()->getGlobalID()); + OwnedCombineCandidates[Idx]->setConflicts(OwnedCombineCandidates); + } +} + +std::vector> GenericCombiner::applyCombiner( + SUnit &CombineRootSUnit, const MachineDominatorTree &MDT, + const AIE::DataDependenceHelper &DAG, const MachineRegisterInfo &MRI, + const AIEBaseInstrInfo &TII) const { + std::vector> AppliedCombiners; + CombinerGain ZeroGain; + + for (auto *DefI : getCombineCandidates(CombineRootSUnit.getInstr(), DAG)) { + if (!DefI) { + LLVM_DEBUG(dbgs() << " [GCombiner] Skipping: Could not get DefInstr for " + << *CombineRootSUnit.getInstr()); + continue; + } + + if (isTriviallyDead(*DefI, MRI)) { + LLVM_DEBUG(dbgs() << " [GCombiner] Skipping: Trivially dead " << *DefI); + continue; + } + + std::unique_ptr Combiner = this->clone(); + const bool SetupSuccess = + Combiner->setupCombiner({DefI, CombineRootSUnit.getInstr()}, &DAG); + if (!SetupSuccess) { + LLVM_DEBUG(dbgs() << " [GCombiner] Skipping; Could not setup Combiner " + << *DefI << "\n"); + continue; + } + + if (!Combiner->tryToSetCombinedOpCode()) { + LLVM_DEBUG(dbgs() << " [GCombiner] Opcode is not valid for " << *DefI + << "\n"); + continue; + } + + if (!Combiner->canReorderCombineInstrs(DAG, MRI, TII)) { + LLVM_DEBUG(dbgs() << " [GCombiner] Could not move Uses after " + << *DefI << "\n"); + continue; + } + Combiner->adjustGain(MDT); + if (Combiner->getGain() < ZeroGain) { + LLVM_DEBUG(dbgs() << " [GCombiner] Negative Gain for " << *DefI + << "\n"); + continue; + } + + LLVM_DEBUG(dbgs() << " [GCombiner] Found 
Combiner: "; + Combiner->dumpFull();); + AppliedCombiners.push_back(std::move(Combiner)); + } + return AppliedCombiners; +} + +Register AIEGlobalCombiner::getClusterBaseRegister(GenericCombiner &Combiner) { + std::optional NewBaseReg; + for (auto &[BaseReg, OverlappingRegs] : BaseRegisterMap) { + for (const auto &Reg : Combiner.getClusterRegs()) + if (find(OverlappingRegs, Reg) != OverlappingRegs.end()) { + NewBaseReg = BaseReg; + break; + } + if (NewBaseReg) + break; + } + + if (!NewBaseReg) { + NewBaseReg = Combiner.getClusterRegs()[0]; + BaseRegisterMap[*NewBaseReg] = {}; + } + std::vector &OverlappingRegs = BaseRegisterMap[*NewBaseReg]; + + for (const auto &Reg : Combiner.getClusterRegs()) + if (find(OverlappingRegs, Reg) == OverlappingRegs.end()) + OverlappingRegs.push_back(Reg); + + return *NewBaseReg; +} + +void AIEGlobalCombiner::clear() { ClusteredCombiners.clear(); } + +std::vector AIEGlobalCombiner::getCombineCandidates( + std::map ClusteredCombiners) { + std::vector Result; + for (auto Candidate : ClusteredCombiners) { + Result.push_back(Candidate.second); + } + + return Result; +} + +// -------------------------- CombineCandidates ------------------------------// + +std::vector CombineCandidates::searchCombinerSet( + const std::vector> + &OwnedCombineCandidates) { + if (Combiners.empty()) + return {}; + + const auto *MBB = Combiners[0]->CombinerData.CombineRoot->getParent(); + const unsigned NumCombiner = Combiners[0]->getConflicts().size(); + LLVM_DEBUG(dbgs() << MBB->getName() << " - Combiner Search Start \n"); + + // seed greedy solution + auto BestSolution = getGreedySolution(); + + std::vector Result; + BitVector CombinerBitVec = BestSolution.getCombinersBitVector(); + for (int Idx = CombinerBitVec.find_first(); Idx != -1; + Idx = CombinerBitVec.find_next(Idx)) { + Result.push_back(Combiners[Idx]); + } + return Result; +} + +void CombineCandidates::append(GenericCombiner *Combiner) { + Combiners.push_back(Combiner); +} + +CombinerSolution 
CombineCandidates::getGreedySolution() const { + unsigned NumCombiner = Combiners[0]->getConflicts().size(); + + BitVector Conflicts(NumCombiner); + CombinerSolution GreedySolution(NumCombiner); + CombinerGain ZeroGain; + + for (unsigned Idx = 0; Idx < Combiners.size(); Idx++) { + const auto *Combiner = Combiners[Idx]; + if (Combiner->hasConflict(Conflicts)) + continue; + + if (Combiner->getOverlapGain(GreedySolution.getCombinersBitVector(), + Combiners) <= ZeroGain) + continue; + + LLVM_DEBUG(dbgs() << "[Greedy] Added " << *Combiner << "\n"); + GreedySolution.add(Combiner, Idx, Combiners); + Conflicts |= Combiner->getConflicts(); + } + + GreedySolution.setIndex(Combiners.size()); + return GreedySolution; +} + +CombinerGain +CombineCandidates::getMaxPotentialGain(const CombinerSolution &Current, + const unsigned Index) const { + CombinerGain ZeroGain; + CombinerGain NextGain(Current.getGain()); + for (unsigned Idx = Index; Idx < Combiners.size(); Idx++) { + GenericCombiner *Combiner = Combiners[Idx]; + if (Current.hasConflict(Combiner)) + continue; + + auto OverlapGain = + Combiner->getOverlapGain(Current.getCombinersBitVector(), Combiners); + if (OverlapGain > ZeroGain) + NextGain += OverlapGain; + } + + return NextGain; +} + +void CombineCandidates::filterOut( + const std::vector &UsedCombiners) { + for (const auto &Combiner : UsedCombiners) { + auto HasConflict = [&](const GenericCombiner *Obj) { + return Combiner->hasConflict(/*ConflictVector=*/Obj->getConflicts()); + }; + erase_if(Combiners, HasConflict); + } + + LLVM_DEBUG(dbgs() << " [Search] After Filtering left with " + << Combiners.size() << " Options\n"); +} + +// -------------------------- CombineResult ----------------------------------// + +bool GenericCombiner::canReorderCombineInstrs( + const AIE::DataDependenceHelper &DAG, const MachineRegisterInfo &MRI, + const AIEBaseInstrInfo &TII) { + assert(CombinerData.CombineInstrs.size() > 0); + auto ReorderInstrs = [](std::set &ToMove) { + std::vector 
ToMoveVec; + + for (SUnit *Candidate : ToMove) { + ToMoveVec.push_back(Candidate); + } + + // maintain dependencies between SUnits by keeping the ordering of the MBB + sort(ToMoveVec, [](const SUnit *A, const SUnit *B) { + return A->NodeNum < B->NodeNum; + }); + + std::vector OrderedMIs; + for (auto *SUnit : ToMoveVec) { + OrderedMIs.push_back(SUnit->getInstr()); + } + + return OrderedMIs; + }; + + auto MoveAbleInstrs = getInstructionsToMove(DAG); + if (!MoveAbleInstrs) + return false; + + const unsigned InsertionPointDepth = + DAG.getSUnit(CombinerData.InsertionPoint)->getDepth(); + + auto [MoveUp, MoveDown] = *MoveAbleInstrs; + + for (SUnit *MoveDownCandidate : MoveDown) { + std::set ToMove; + if (!isMovable(*MoveDownCandidate, InsertionPointDepth, DAG, + /*MoveDown=*/true, ToMove)) + return false; + CombinerData.DelayInstrPastInsertionPoint = ReorderInstrs(ToMove); + } + + for (SUnit *MoveUpCandidate : MoveUp) { + std::set ToMove; + if (!isMovable(*MoveUpCandidate, InsertionPointDepth, DAG, + /*MoveDown=*/false, ToMove)) + return false; + CombinerData.MoveUpInstrsToInsertionPoint = ReorderInstrs(ToMove); + } + + return true; +} + +std::optional, std::vector>> +GenericCombiner::getInstructionsToMove(const AIE::DataDependenceHelper &DAG) { + assert(InsertionPointNodeNum != (unsigned)-1); + std::vector MoveUp; + std::vector MoveDown; + + for (auto *MI : CombinerData.CombineInstrs) { + if (MI == CombinerData.InsertionPoint) + continue; + + auto *SUnit = DAG.getSUnit(MI); + if (!SUnit) + // Instruction is in a different MBB. 
+ // Fixme: properly check if MI can be moved across MBB borders + return {}; + + const auto MINodeNum = SUnit->NodeNum; + if (MINodeNum > InsertionPointNodeNum) { + LLVM_DEBUG(dbgs() << "Check Move up " << *MI); + MoveUp.push_back(SUnit); + } else { + LLVM_DEBUG(dbgs() << "Check Move down " << *MI); + MoveDown.push_back(SUnit); + } + } + return {{MoveUp, MoveDown}}; +} + +bool GenericCombiner::isMovable(SUnit &Candidate, + const unsigned InsertionPointDepth, + const AIE::DataDependenceHelper &DAG, + const bool MoveDown, + std::set &ToMove) { + + auto GetNext = [MoveDown](SUnit &Candidate) { + if (MoveDown) { + return Candidate.Succs; + } + + return Candidate.Preds; + }; + + for (auto &Dep : GetNext(Candidate)) { + MachineInstr *DepI = Dep.getSUnit()->getInstr(); + if (DepI == CombinerData.InsertionPoint) { + LLVM_DEBUG(dbgs() << "[Movability Check] Dependency to Insertion Point, " + "skipping Combiner " + << *DepI); + return false; + } + + if (!canMove(Dep.getSUnit(), MoveDown)) { + LLVM_DEBUG( + dbgs() + << "[Movability Check] Combiner does not know how to handle move, " + "skipping Combiner " + << *DepI); + return false; + } + + if (ToMove.count(Dep.getSUnit())) { + LLVM_DEBUG(dbgs() << "[Movability Check] Already encountered, no need " + "to continue movability check for " + << *DepI); + continue; + } + + ToMove.emplace(Dep.getSUnit()); + LLVM_DEBUG(dbgs() << "[Movability Check] Adding " << *DepI); + const bool Movable = + isMovable(*Dep.getSUnit(), InsertionPointDepth, DAG, MoveDown, ToMove); + if (!Movable) + return false; + } + return true; +} + +void GenericCombiner::setConflicts( + std::vector> &AllCombiners) { + LLVM_DEBUG(dbgs() << "Conflict check " << *this << "\n"); + ConflictCombiners.resize(AllCombiners.size()); + + for (unsigned Idx = 0; Idx < AllCombiners.size(); Idx++) { + if (hasConflict(*AllCombiners[Idx])) + ConflictCombiners.set(Idx); + } +} + +unsigned GenericCombiner::getGlobalID() const { return GlobalID; } + +void 
GenericCombiner::setGlobalID(unsigned GlobalID) { + this->GlobalID = GlobalID; +} + +/// \return whether a Combiner is used after a Remove-Combiner, that +/// are part of the same Cluster. The Ordering of the Combiners \p A and \p B is +/// irrelevant. +static bool hasUseAfterRemoval(const GenericCombiner &A, + const GenericCombiner &B, const unsigned AIdx, + const MachineInstr *MI) { + auto FindIndex = [](const GenericCombiner &Combiner, const MachineInstr *MI) { + // get Index of MI in CombineInstrs + auto It = find(Combiner.CombinerData.CombineInstrs, MI); + if (It == Combiner.CombinerData.CombineInstrs.end()) + return -1; + + return (int)std::distance(Combiner.CombinerData.CombineInstrs.begin(), It); + }; + + // early exit, if Combiners don't exactly overlap + for (auto &Reg : A.getClusterRegs()) { + if (find(B.getClusterRegs(), Reg) == B.getClusterRegs().end()) + return false; + } + + int BIdx = FindIndex(B, MI); + if (BIdx == -1) + return false; + + if (!A.CombinerData.RemoveInstrs[AIdx] && !B.CombinerData.RemoveInstrs[BIdx]) + return false; + + if (A.CombinerData.RemoveInstrs[AIdx] && B.CombinerData.RemoveInstrs[BIdx]) + return true; + + auto RemovalInsertionPointNodeNum = A.InsertionPointNodeNum; + auto NonRemovalInsertionPointNodeNum = B.InsertionPointNodeNum; + if (B.CombinerData.RemoveInstrs[BIdx]) { + // B is the removal Combiner, swap Node Nums + RemovalInsertionPointNodeNum = B.InsertionPointNodeNum; + NonRemovalInsertionPointNodeNum = A.InsertionPointNodeNum; + } + + return NonRemovalInsertionPointNodeNum > RemovalInsertionPointNodeNum; +} + +bool GenericCombiner::hasConflict(const GenericCombiner &Combiner) const { + for (unsigned Idx = 0; Idx < Combiner.CombinerData.CombineInstrs.size(); + Idx++) { + auto *MI = Combiner.CombinerData.CombineInstrs[Idx]; + if (isCombineRoot(MI) || + (contains(*MI) && hasUseAfterRemoval(*this, Combiner, Idx, MI))) { + LLVM_DEBUG(dbgs() << " [Conflict] " << *this << " " << Combiner + << "\n"); + return true; + } + } + 
+ return false; +} + +bool GenericCombiner::hasConflict(const BitVector &ConflictVector) const { + return ConflictVector.test(GlobalID); +} + +CombinerGain GenericCombiner::getOverlapGain( + const BitVector &AlreadyUsedCombiners, + const std::vector &AllCombiners) const { + std::vector MaterializedImmCopies; + + auto AddImmRegUses = [&MaterializedImmCopies]( + const GenericCombiner *Combiner) { + assert(Combiner); + for (auto ImmInReg : Combiner->getAllImmInRegs()) { + if (find(MaterializedImmCopies, ImmInReg) == MaterializedImmCopies.end()) + MaterializedImmCopies.push_back(ImmInReg); + } + }; + + CombinerGain OverLapGain(getGain()); + for (int Idx = AlreadyUsedCombiners.find_first(); Idx != -1; + Idx = AlreadyUsedCombiners.find_next(Idx)) { + const GenericCombiner *UsedCombiner = AllCombiners.at(Idx); + AddImmRegUses(UsedCombiner); + OverLapGain -= getOverlapPenalty(UsedCombiner); + } + + OverLapGain += getImmediateReuseGain(MaterializedImmCopies); + + return OverLapGain; +} + +unsigned GenericCombiner::getCombinedOpCode() const { + return CombinerData.CombinedInstrOpcode; +} + +bool GenericCombiner::contains(MachineInstr &MI) const { + auto It = + find_if(CombinerData.CombineInstrs, [&MI](const MachineInstr *CombineMI) { + return CombineMI == &MI; + }); + + return It != CombinerData.CombineInstrs.end(); +} + +bool GenericCombiner::isCombineRoot(const MachineInstr *MI) const { + return CombinerData.CombineRoot == MI; +} + +void GenericCombiner::dumpFull() const { + dbgs() << *this << "\n"; + CombinerData.dumpFull(); +} + +raw_ostream &operator<<(raw_ostream &OS, const GenericCombiner &Val) { + OS << Val.getName() << "[" << Val.getGlobalID() << "]" + << " ("; + for (auto *MI : Val.CombinerData.CombineInstrs) { + OS << MI->getOperand(0) << ","; + } + OS << ") "; + OS << Val.getGain() << " "; + + return OS; +} // -------------------------- CombinerGain -----------------------------------// @@ -30,13 +565,11 @@ void CombinerGain::reset() { } bool 
CombinerGain::operator>(const CombinerGain &Rhs) const { - for (unsigned Idx = 0; Idx < GainVector.size(); Idx++) { - if (GainVector[Idx] == Rhs.GainVector[Idx]) - continue; + return Rhs < *this; +} - return GainVector[Idx] > Rhs.GainVector[Idx]; - } - return false; +bool CombinerGain::operator<=(const CombinerGain &Rhs) const { + return !(*this > Rhs); } bool CombinerGain::operator<(const CombinerGain &Rhs) const { @@ -78,6 +611,65 @@ CombinerGain CombinerGain::operator-(const CombinerGain &Rhs) const { return Result -= Rhs; } +// -------------------------- CombinerSolution -------------------------------// + +CombinerSolution::CombinerSolution(const unsigned NumCombiners) + : Index(0), Combiners(NumCombiners), ConflictCombiners(NumCombiners) {} + +CombinerSolution::CombinerSolution( + const CombinerSolution &Other, const GenericCombiner *Combiner, + const CombinerGain &MaxFutureGain, const unsigned Idx, + const std::vector &CombinerSubSet) + : CombinerSolution(Other) { + + Index++; + this->MaxFutureGain = MaxFutureGain; + add(Combiner, Idx, CombinerSubSet); +} + +void CombinerSolution::add( + const GenericCombiner *Combiner, const unsigned Idx, + const std::vector &CombinerSubSet) { + if (!Combiner) + return; + + Gain += Combiner->getOverlapGain(Combiners, CombinerSubSet); + Combiners.set(Idx); + ConflictCombiners |= Combiner->getConflicts(); + assert(Combiners.size() == ConflictCombiners.size()); +} + +void CombinerSolution::remove(const int Idx) { Combiners.reset(Idx); } + +void CombinerSolution::recalculateGain( + const std::vector &AllCombiners) { + Gain.reset(); + for (int Idx = Combiners.find_first(); Idx != -1; + Idx = Combiners.find_next(Idx)) { + Gain += AllCombiners[Idx]->getOverlapGain(Combiners, AllCombiners); + } +} + +const BitVector &CombinerSolution::getCombinersBitVector() const { + return Combiners; +} + +bool CombinerSolution::hasConflict(const GenericCombiner *Combiner) const { + return ConflictCombiners[Combiner->getGlobalID()]; +} + +bool 
CombinerSolution::operator<(const CombinerSolution &Other) const { + if (MaxFutureGain == Other.MaxFutureGain) + // If Potential Future Gains are equal, sort by remaining search steps + return Index < Other.Index; + + return MaxFutureGain < Other.MaxFutureGain; +} + +bool CombinerSolution::operator==(const CombinerSolution &Other) const { + return Index == Other.Index && Combiners == Other.Combiners; +} + // --------------------------- Combiner --------------------------------------// Combiner::Combiner(std::vector CombineInstrs, @@ -126,6 +718,18 @@ void Combiner::dumpFull(unsigned *GlobalID, CombinerGain *Gain) const { // ---------------------------------------------------------------------------// +void GlobalCombiner::initDAG(AIE::DataDependenceHelper &DAG, + MachineBasicBlock &MBB) { + DAG.clearDAG(); + for (auto &MI : MBB) { + if (!MI.isTerminator()) { + DAG.initSUnit(MI); + } + } + DAG.buildEdges(); + DAG.makeMaps(); +} + raw_ostream &operator<<(raw_ostream &OS, const CombinerGain &Val) { OS << "["; for (auto &Gain : Val.GainVector) { @@ -144,4 +748,16 @@ raw_ostream &operator<<(raw_ostream &OS, const Combiner &Val) { return OS; } -}; // namespace llvm::AIE +raw_ostream &operator<<(raw_ostream &OS, const CombinerSolution &Val) { + OS << "[" << Val.getIndex() << "]"; // Print format: [Index]Idx Idx ... Gain + const BitVector &CombinerVector = Val.getCombinersBitVector(); + for (int Idx = CombinerVector.find_first(); Idx != -1; + Idx = CombinerVector.find_next(Idx)) { + OS << Idx << " "; // Write to the stream passed in, not to dbgs() + } + OS << Val.getGain(); // Unconditional, so the gain is also printed in NDEBUG + + return OS; +} + +} // namespace llvm::AIE diff --git a/llvm/lib/Target/AIE/AIEGlobalCombiner.h b/llvm/lib/Target/AIE/AIEGlobalCombiner.h index 5453c9313a21..955417ca0e2a 100644 --- a/llvm/lib/Target/AIE/AIEGlobalCombiner.h +++ b/llvm/lib/Target/AIE/AIEGlobalCombiner.h @@ -100,12 +100,321 @@ class Combiner { raw_ostream &operator<<(raw_ostream &OS, const Combiner &Val); +/// Abstract Combiner Class.
Every Combiner should derive from this and +/// implement the necessary methods. class GenericCombiner { + /// \return whether \p Candidate is Movable in the direction \p MoveDown to + /// \p InsertionPointDepth . The Instructions are collected in \p ToMove . + bool isMovable(SUnit &Candidate, const unsigned InsertionPointDepth, + const AIE::DataDependenceHelper &DAG, const bool MoveDown, + std::set &ToMove); + + /// \return whether Combine Instructions can be reordered and the Combiner is + /// valid + bool canReorderCombineInstrs(const AIE::DataDependenceHelper &DAG, + const MachineRegisterInfo &MRI, + const AIEBaseInstrInfo &TII); + +protected: + /// Collect all Immediate Values that have to be stored in Registers, because + /// the bitencoding does not allow it to be encoded in the Instruction + std::vector ImmInRegs; + + /// Unique ID for the Combiner. It is used to keep track of Conflicts. + unsigned GlobalID = -1; + /// Keeps track of Combiners that have conflicts with this Combiner. A + /// Combiner is encoded through its GlobalID. + BitVector ConflictCombiners; + + /// \return SUnits that have to be moved up or down before the InsertionPoint + /// as the pair (MoveUp, MoveDown). \pre InsertionPoint has been set + virtual std::optional, std::vector>> + getInstructionsToMove(const AIE::DataDependenceHelper &DAG); + + /// Set the OpCode for the Combiner. \return whether the Opcode can be set + virtual bool tryToSetCombinedOpCode() = 0; + + /// Set the InsertionPoint of the Combiner, i.e. the position, where the + /// CombinedOpcode Instruction should be inserted + virtual void setInsertionPoint() = 0; + + /// Set the Gain for the Combiner + virtual void adjustGain(const MachineDominatorTree &MDT) = 0; + + /// \return Immediates that don't fit into the immediate Bit Encoding and have + /// to be stored in a Register + virtual std::vector getAllImmInRegs() const { return ImmInRegs; }; + + /// \return Overlap Penalty with \p Combiner . Note: Only subtract the + /// Penalty.
+ virtual CombinerGain + getOverlapPenalty(const GenericCombiner *Combiner) const = 0; + + /// \return Gain of an Immediate that can be reused from \p UsedImmediates + virtual CombinerGain + getImmediateReuseGain(const std::vector &UsedImmediates) const = 0; + + /// \return If Combiner can handle gain and conflict calculation if + /// \p Candidate is moved in the direction \p MoveDown + virtual bool canMove(const SUnit *Candidate, const bool MoveDown) const = 0; + public: + /// Internal data structure of the Combiner Combiner CombinerData; + /// CombineRoot Node Number in the DAG + unsigned CombineRootNodeNum = -1; + /// Node Number in DAG of InsertionPoint + unsigned InsertionPointNodeNum = -1; + /// Keep track of the Node Numbers of the CombineInstrs in the DAG + std::vector CombineInstrNodeNum; + + GenericCombiner() = default; + GenericCombiner(const GenericCombiner &Other) = default; + GenericCombiner(StringRef Name) { CombinerData.Name = Name; } + + virtual ~GenericCombiner() = default; + + /// \return A clone of this Combiner. + /// Note: This is used on the template (empty) Combiner. + virtual std::unique_ptr clone() const = 0; + + /// \return All Combiners that can be found with the same \p CombineRoot . + /// Note: This is used on the template (empty) Combiner. + std::vector> + applyCombiner(SUnit &CombineRoot, const MachineDominatorTree &MDT, + const AIE::DataDependenceHelper &DAG, + const MachineRegisterInfo &MRI, + const AIEBaseInstrInfo &TII) const; + + /// \return OpCode of the to-be-inserted Instruction + unsigned getCombinedOpCode() const; + + /// \return MachineInstrs that can be used to form Combiners, based on + /// \p OriginInstr . Each combination of MachineInstr with \p OriginInstr + /// could be an individual Combiner + virtual std::vector + getCombineCandidates(MachineInstr *OriginInstr, + const AIE::DataDependenceHelper &DAG) const = 0; + + /// Set up the Combiner with \p CombineInstrs . + /// \return whether the setup was successful. 
+ /// This method is used if the Combiner is cloned from an empty (default) + /// Combiner. + virtual bool setupCombiner(std::vector CombineInstrs, + const AIE::DataDependenceHelper *DAG) = 0; + + /// \return Name of the Combiner. + StringRef getName() const { return CombinerData.Name; }; + + /// Dump all the relevant information of the Combiner. + void dumpFull() const; + + /// \return whether \p MI can be a CombineRoot of the Combiner. + virtual bool isCombineRootCandidate(const MachineInstr *MI) const = 0; + + /// \return whether \p MI is the CombineRoot + bool isCombineRoot(const MachineInstr *MI) const; + + /// \return the Gain that applying the combiner would incur. The gain is + /// maximized. + virtual const CombinerGain &getGain() const = 0; + + /// \return Gain when \p AlreadyUsedCombiners are already selected. A + /// position in \p AlreadyUsedCombiners corresponds to the position in + /// \p CombinerSubSet + CombinerGain + getOverlapGain(const BitVector &AlreadyUsedCombiners, + const std::vector &CombinerSubSet) const; + + /// \return the Registers by which different Combiners can be clustered + virtual const std::vector getClusterRegs() const = 0; + + /// \return whether Combiner contains \p MI in the CombineInstrs + bool contains(MachineInstr &MI) const; + + /// \return whether this Combiner conflicts with \p Combiner , i.e. + /// both cannot be applied together + bool hasConflict(const GenericCombiner &Combiner) const; + + /// \return whether the bit at this Combiner's GlobalID is set in \p ConflictVector + bool hasConflict(const BitVector &ConflictVector) const; + + /// \return BitVector containing all the conflicting Combiners with this + /// Combiner. Each Combiner position in the Bitvector is determined by + /// GlobalID. + const BitVector &getConflicts() const { return ConflictCombiners; } + + /// Statically set Conflicts in ConflictCombiners for each Combiner in + /// \p AllCombiners that conflicts with this combiner.
+ void + setConflicts(std::vector> &AllCombiners); + + /// \return GlobalID of this Combiner, representing the total number + /// of Combiners created previously. + unsigned getGlobalID() const; + + /// Set unique Identifier for this Combiner to \p GlobalID + void setGlobalID(unsigned GlobalID); }; +raw_ostream &operator<<(raw_ostream &OS, const GenericCombiner &Val); + +/// Helper class used when searching for a good Combiner set +class CombinerSolution { + /// Next Combiner to consider in the list of all Combiners + unsigned Index = 0; + /// Gain of the Solution + CombinerGain Gain; + /// Potential maximum future gain for this Solution + CombinerGain MaxFutureGain; + /// Applied Combiners to get the Solution + BitVector Combiners; + /// Conflicting Combiners with the Solution + BitVector ConflictCombiners; + +public: + CombinerSolution() = default; + CombinerSolution(const CombinerSolution &Other) = default; + /// Size the Combiners and ConflictCombiners BitVectors to \p NumCombiners + CombinerSolution(const unsigned NumCombiners); + + /// \p Combiner is at position \p Idx in the \p CombinerSubSet + CombinerSolution(const CombinerSolution &Other, + const GenericCombiner *Combiner, + const CombinerGain &MaxFutureGain, const unsigned Idx, + const std::vector &CombinerSubSet); + + unsigned getIndex() const { return Index; } + + void setIndex(unsigned Index) { this->Index = Index; } + + const CombinerGain &getGain() const { return Gain; } + + void setMaxFutureGain(const CombinerGain &Gain) { MaxFutureGain = Gain; } + + const CombinerGain &getMaxFutureGain() const { return MaxFutureGain; } + + bool hasConflict(const GenericCombiner *Combiner) const; + + /// Add \p Combiner to Solution.
\p Combiner is at position \p Idx in the + /// \p CombinerSubSet + void add(const GenericCombiner *Combiner, const unsigned Idx, + const std::vector &CombinerSubSet); + + /// Remove Combiner at Position \p Idx + void remove(const int Idx); + + void recalculateGain(const std::vector &AllCombiners); + + const BitVector &getCombinersBitVector() const; + + const BitVector &getConflicts() const { return ConflictCombiners; } + + bool operator<(const CombinerSolution &Other) const; + + bool operator==(const CombinerSolution &Other) const; +}; + +raw_ostream &operator<<(raw_ostream &OS, const CombinerSolution &Val); + +/// Helper Class that contains all the Combiners that have overlapping clustered +/// Registers. +class CombineCandidates { + + std::vector Combiners; + + /// \return An initial greedy Solution + CombinerSolution getGreedySolution() const; + + /// \return Maximum potential gain starting with the Solution \p Current and + /// searching through the Combiner starting at \p Index + CombinerGain getMaxPotentialGain(const CombinerSolution &Current, + const unsigned Index) const; + +public: + /// Add \p Combiner + void append(GenericCombiner *Combiner); + + /// \return Combiners from \p OwnedCombineCandidates that maximize the gain + /// when applied + std::vector + searchCombinerSet(const std::vector> + &OwnedCombineCandidates); + + /// Filtering out all Conflicts with \p UsedCombiners + void filterOut(const std::vector &UsedCombiners); + + /// Clear internal data + void clear(); +}; + +class AIEGlobalCombiner { + const MachineRegisterInfo *MRI = nullptr; + const AIEBaseInstrInfo *TII = nullptr; + const MachineDominatorTree *MDT = nullptr; + /// List of all template Combiners + const std::vector CombinerOptions; + /// Map between Base Register and Registers that derive from the BaseRegister + std::map> BaseRegisterMap; + + /// DAG of the MBB + AIE::DataDependenceHelper &DAG; + /// Clustered overlapping Combiner Candidates + std::map ClusteredCombiners; + /// Owner 
of the CombineCandidates + std::vector> OwnedCombineCandidates; + + /// Generate all possible Combiners for \p MBB + void generateCombiners(MachineBasicBlock &MBB); + + /// Search for the best combiner set that maximizes the gain from the + /// Combiners + std::vector findBeneficialCombiners(); + + /// Reverse Address Chaining effect to find out the base Pointer of this + /// Combiner + /// \return Register of the Base Pointer + Register getClusterBaseRegister(GenericCombiner &Combiner); + + /// Set instructions to move up or down to the InsertionPoint for each + /// Combiner in \p Combiners + void setMovableInstrs(std::vector &Combiners); + + /// Selected Combiners that help maximize the Combiner gain + std::vector FixedCombiners; + /// Index to current CombineCandidate to process + int CombineIdx = 0; + + /// Clear the ClusteredCombiners collected so far + void clear(); + + /// Calculate all possible Combiners that are possible with \p CombineRoot and + /// \p Combiner + void calculateCombineCandidates(SUnit &CombineRoot, + const GenericCombiner *Combiner); + + void calculateCombinerConflicts(); + + /// \return the CombineCandidates of all clusters, in map order (unsorted) + std::vector getCombineCandidates( + std::map ClusteredCombiners); + +public: + AIEGlobalCombiner(const std::vector &Combiners, + const MachineDominatorTree &MDT, + AIE::DataDependenceHelper &DAG, + const MachineRegisterInfo *MRI, const AIEBaseInstrInfo *TII) + : MRI(MRI), TII(TII), MDT(&MDT), CombinerOptions(Combiners), DAG(DAG) {} + + std::vector getCombiners(MachineBasicBlock &MBB); +}; + +namespace GlobalCombiner { +/// Init the \p DAG for the \p MBB +void initDAG(AIE::DataDependenceHelper &DAG, MachineBasicBlock &MBB); +} // namespace GlobalCombiner + } // namespace llvm::AIE #endif // LLVM_LIB_TARGET_AIE_AIEGLOBALCOMBINER_H diff --git a/llvm/lib/Target/AIE/AIEGlobalCombinerPtrMods.cpp b/llvm/lib/Target/AIE/AIEGlobalCombinerPtrMods.cpp new file mode 100644 index 000000000000..bd13c1506483 --- /dev/null +++
b/llvm/lib/Target/AIE/AIEGlobalCombinerPtrMods.cpp @@ -0,0 +1,458 @@ +//===--- AIEGlobalCombinerPtrMods.cpp - Global Pointer Modifier combiner --===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates +// +//===----------------------------------------------------------------------===// +// +// Define Pre-Increment (Offset) and Post-Increment Combiners for the global +// combiner search. +// +//===----------------------------------------------------------------------===// + +#include "AIEGlobalCombinerPtrMods.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/Support/Debug.h" + +namespace llvm::AIE { +#define DEBUG_TYPE "global-combiner" + +const static int PtrModBits = 20; +namespace { +unsigned getLoadStoreSize(const MachineInstr &MI) { + // We are guaranteed to have MMOs during Instruction Selection. + // We need them to select the correct instruction when they depend on the + // size in memory and not on the register size. E.g.: part word stores. 
+ return (*MI.memoperands_begin())->getSizeInBits().getValue(); +} + +std::optional getImm(const MachineInstr &PtrAdd, + const MachineRegisterInfo &MRI) { + assert(PtrAdd.getOpcode() == TargetOpcode::G_PTR_ADD); + auto OffsetMO = PtrAdd.getOperand(2); + std::optional Offset = + getIConstantVRegValWithLookThrough(OffsetMO.getReg(), MRI); + if (!Offset) + return {}; + + return APInt(PtrModBits, Offset->Value.getSExtValue(), /*isSigned=*/ + true); +} + +} // namespace + +const std::vector PointerModifierCombiner::getClusterRegs() const { + auto *PtrMod = getPtrInc(); + auto InputPtrIdx = PtrModSupport.getInputPtrIdx(*PtrMod); + assert(InputPtrIdx); + auto InputPtrReg = PtrMod->getOperand(*InputPtrIdx).getReg(); + + const auto OpIdx = PtrModSupport.getOutputPtrIdx(*PtrMod); + assert(OpIdx); + auto OuputPtrReg = PtrMod->getOperand(*OpIdx).getReg(); + return {InputPtrReg, OuputPtrReg}; +} + +std::vector PointerModifierCombiner::getCombineCandidates( + MachineInstr *MemI, const AIE::DataDependenceHelper &DAG) const { + const unsigned VecSize = + MRI->getType(MemI->getOperand(0).getReg()).getSizeInBits(); + if (VecSize > TII->getMaxSupportedLdStIncSize()) + return {}; + + return getPtrInstrs(MemI); +} + +UsageCount +PointerModifierCombiner::getUsageCount(Register Addr, + const MachineDominatorTree &MDT) const { + int UseDefsPastUseInstr = 0; + int PtrModCount = 0; + bool PHIUsageInSameMBB = false; + + const MachineInstr *PtrInc = getPtrInc(); + const auto *MBB = PtrInc->getParent(); + + for (auto &User : MRI->use_nodbg_instructions(Addr)) { + if (User.isPHI() && MBB == User.getParent()) { + PHIUsageInSameMBB = true; + continue; + } + + if (contains(User)) + continue; + + const auto *UserMBB = User.getParent(); + if (!MDT.dominates(MBB, UserMBB)) + continue; + + if (UserMBB == MBB) { + auto *UserSUnit = DAG->getSUnit(&User); + if (!UserSUnit) + continue; + if (UserSUnit->NodeNum < InsertionPointNodeNum) + continue; + } + + LLVM_DEBUG(dbgs() << "Checking " << User); + if 
(PtrModSupport.isNativeS20Consumer(User)) + PtrModCount += 1; + else + UseDefsPastUseInstr += 1; + } + + if (PHIUsageInSameMBB) + UseDefsPastUseInstr++; + + return {/*PtrModCount=*/PtrModCount, + /*NonPtrModCount=*/UseDefsPastUseInstr}; +} + +bool PointerModifierCombiner::canMove(const SUnit *Candidate, + const bool MoveDown) const { + // Moving another combiner may change the ordering of the final Combiners, + // thus overlap gain Calculation & conflicts should be made aware of the + // changes in the final Combiners ordering. + /// Fixme: Properly calculate Overlap-gain and conflicts if combiners are + /// moved and allow all instructions to be moved. + assert(Candidate); + if (!isCombineRootCandidate(Candidate->getInstr())) + return true; + + // Ordering of Memory Instructions has to be preserved. + // Note: Requires that the InsertionPoint is always the Memory Instruction + const bool MIBelowInsertion = Candidate->NodeNum > InsertionPointNodeNum; + return (MoveDown && MIBelowInsertion) || (!MoveDown && !MIBelowInsertion); +} + +bool PointerModifierCombiner::isCombineRootCandidate( + const MachineInstr *MI) const { + switch (MI->getOpcode()) { + case TargetOpcode::G_STORE: + case TargetOpcode::G_LOAD: + case TargetOpcode::G_SEXTLOAD: + case TargetOpcode::G_ZEXTLOAD: + return true; + } + return false; +} + +const PtrModGain &PointerModifierCombiner::getGain() const { return Gain; } + +bool PointerModifierCombiner::setupCombiner( + std::vector CombineInstrs, + const AIE::DataDependenceHelper *DAG) { + setupCombineInstrs(CombineInstrs, DAG); + return true; +} + +void PointerModifierCombiner::setupCombineInstrs( + std::vector CombineInstrs, + const AIE::DataDependenceHelper *DAG) { + assert(CombineInstrs.size() == 2); + assert(DAG); + assert(CombineInstrs[1]->mayLoadOrStore()); + this->CombinerData.CombineInstrs = CombineInstrs; + + CombinerData.RemoveInstrs.resize(2); + if (RemovePtrMod) + // ptr modifier + CombinerData.RemoveInstrs.set(0); + + this->DAG = DAG; + 
for (auto *MI : this->CombinerData.CombineInstrs) { + auto *SUnit = DAG->getSUnit(MI); + if (!SUnit) { + assert(!MI->mayLoadOrStore()); + // if CombineInstrs exist outside of the current MBB, + // canReorderCombineInstrs should take care of removing invalid + // combiners, i.e. combiners, that need to move the + // outside-of-the-MBB-instruction + continue; + } + + CombineInstrNodeNum.push_back(SUnit->NodeNum); + } + + // set CombineRoot + CombinerData.CombineRoot = CombineInstrs[1]; + assert(isCombineRootCandidate(CombinerData.CombineRoot)); + CombineRootNodeNum = CombineInstrNodeNum.back(); + + setInsertionPoint(); +} + +void PointerModifierCombiner::setInsertionPoint() { + MachineInstr *MemI = getMemI(); + assert(MemI->mayLoadOrStore()); + CombinerData.InsertionPoint = MemI; + InsertionPointNodeNum = CombineInstrNodeNum.back(); +} + +const MachineInstr *PointerModifierCombiner::getPtrInc() const { + return CombinerData.CombineInstrs[0]; +} + +MachineInstr *PointerModifierCombiner::getPtrInc() { + return CombinerData.CombineInstrs[0]; +} + +bool PointerModifierCombiner::hasOverlapPenalty( + const GenericCombiner *Combiner) const { + // Fixme: consider Ordering in the Overlap Penalty calculation. 
+ // Currently these two Variants have the same penalty, even though variant 1 + // should have a lower penalty, since Variant 2 has to copy %p0 twice: + // Variant 1: + // p1 = COPY p0 + // VLDA [p0], 64 + // VLDA [p1, 64] + // VLDA [p1, 128] + // Variant 2: + // p1 = COPY p0 + // VLDA [p0], 64 + // VLDA [p1, 64] + // p0 = COPY p1 + // VLDA [p0], 128 + // VLDA [p0, 64] + + if (!Combiner) + return false; + + const PointerModifierCombiner *PtrModCombiner = + static_cast(Combiner); + + if (PtrModCombiner->getPtrInc() != getPtrInc()) + // if combiners don't overlap, there cannot be an overlap penalty + return false; + + const bool NoRemoveInstrsCombiner = + Combiner->CombinerData.RemoveInstrs.none(); + if ((Combiner->CombinerData.RemoveInstrs.any() && + CombinerData.RemoveInstrs.any()) || + (NoRemoveInstrsCombiner && CombinerData.RemoveInstrs.none())) + // OverlapPenalty can only occur, if one Combiner has RemoveInstructions + // and the Other Combiner does not have Remove instructions + return false; + + const auto ToRemoveInsertionNodeNum = NoRemoveInstrsCombiner + ? InsertionPointNodeNum + : Combiner->InsertionPointNodeNum; + const auto NonRemoveInsertionNodeNum = NoRemoveInstrsCombiner + ? 
Combiner->InsertionPointNodeNum + : InsertionPointNodeNum; + + // overlap penalty occurs, if Non-remove Use occurs after a Remove Combiner + return ToRemoveInsertionNodeNum < NonRemoveInsertionNodeNum; +} + +CombinerGain PointerModifierCombiner::getOverlapPenalty( + const GenericCombiner *Combiner) const { + if (!hasOverlapPenalty(Combiner)) + return {}; + + LLVM_DEBUG(dbgs() << "Overlap Penalty\n"); + PtrModGain Obj; + Obj.setNoCopy(true); + return Obj; +} + +CombinerGain PointerModifierCombiner::getImmediateReuseGain( + const std::vector &UsedImmediates) const { + + bool ReuseImmediate = any_of(getAllImmInRegs(), [&](APInt &ImmInReg) { + return find(UsedImmediates, ImmInReg) != UsedImmediates.end(); + }); + + if (!ReuseImmediate) + return {}; + + LLVM_DEBUG(dbgs() << "Immediate Register Reuse, using Ideal Gain\n"); + PtrModGain Obj; + Obj.setValidImm(true); + return Obj; +} + +bool PointerModifierCombiner::tryToSetCombinedOpCode() { + auto OpCode = getOpCode(getPtrInc(), getMemI()); + if (!OpCode) + return false; + + CombinerData.CombinedInstrOpcode = *OpCode; + return true; +} + +// -------------------------- OffsetCombiner ---------------------------------// + +bool OffsetCombiner::isCombineCandidate(MachineInstr &CombineRoot, + MachineInstr &Candidate) const { + return getOpCode(&CombineRoot, &Candidate).has_value(); +} + +std::vector +OffsetCombiner::getPtrInstrs(MachineInstr *MemI) const { + assert(MemI->mayLoadOrStore()); + const int PtrIdx = 1; + const auto InputPtrReg = MemI->getOperand(PtrIdx).getReg(); + MachineOperand *PtrOrigin = MRI->getOneDef(InputPtrReg); + if (!PtrOrigin) + return {}; + + auto *PtrInc = PtrOrigin->getParent(); + if (!getOpCode(PtrInc, MemI)) + return {}; + + return {PtrInc}; +} + +std::unique_ptr OffsetCombiner::clone() const { + return std::make_unique(*this); +} + +std::optional, std::vector>> +OffsetCombiner::getInstructionsToMove(const AIE::DataDependenceHelper &DAG) { + return {{/*MoveUp*/ {}, /*MoveDown*/ {}}}; +} + +void 
OffsetCombiner::adjustGain(const MachineDominatorTree &MDT) { + const auto *PtrAdd = getPtrInc(); + assert(PtrAdd->getOpcode() == TargetOpcode::G_PTR_ADD); + + const auto DefAddr = PtrAdd->getOperand(0).getReg(); + // If another pointer modifier occurs, discount the gain. + // If there are only uses of the PtrAdd Instruction that do not modify the + // pointer, we do not have to discount the gain, since the Uses can be Offset + // Load/Stores. + auto UsageCounter = getUsageCount(DefAddr, MDT); + if (UsageCounter.PtrModCount > 0) { + LLVM_DEBUG(dbgs() << "Encountered PtrMod \n"); + // PtrMod cannot be removed, because the result is still needed in another + // PtrMod + Gain.setPtrMod(0); + } + + std::optional ImmOffset = getImm(*PtrAdd, *MRI); + if (!ImmOffset) + return; + + if (!TII->isOffsetInImmediateRange( + getCombinedOpCode(), getLoadStoreSize(*CombinerData.CombineRoot), + ImmOffset)) { + // do not add ImmOffset to ImmInRegs + // Multiple ptr_adds with constant immediates may be merged into a single + // ptr_add with the sum of combined Immediates. This happens if Address + // chaining is enabled and Preincrements are used as combiners. In this + // specific case, these ImmOffsets are only a suggestion, rather than the + // actual final Immediate Value. 
+ LLVM_DEBUG(dbgs() << "no valid imm range!\n"); + Gain.setValidImm(false); + return; + } +} + +std::optional OffsetCombiner::getOpCode(MachineInstr *PtrInc, + MachineInstr *MemI) const { + assert(TII); + if (PtrInc->getOpcode() != TargetOpcode::G_PTR_ADD) + return {}; + + return TII->getOffsetMemOpcode(MemI->getOpcode()); +} + +// -------------------------- PostIncCombiner --------------------------------// + +bool PostIncCombiner::isCombineCandidate(MachineInstr &MemI, + MachineInstr &Candidate) const { + auto MemSize = MRI->getType(MemI.getOperand(0).getReg()).getSizeInBits(); + return TII->getCombinedPostIncOpcode(MemI, Candidate, MemSize).has_value(); +} + +bool PostIncCombiner::setupCombiner(std::vector CombineInstrs, + const AIE::DataDependenceHelper *DAG) { + setupCombineInstrs(CombineInstrs, DAG); + assert(CombinerData.CombineInstrs.size() == 2); + + const MachineInstr *PtrInc = getPtrInc(); + const MachineInstr *MemI = getMemI(); + if (PtrInc->getParent() != MemI->getParent()) + /// Combiner spans across multiple MBBs + return false; + + UserIntrinsic = PtrInc->getOpcode() != TargetOpcode::G_PTR_ADD; + CombinerData.Name = UserIntrinsic ? 
"UserIntrinsic" : "PostInc"; + + return true; +} + +std::unique_ptr PostIncCombiner::clone() const { + return std::make_unique(*this); +} + +void PostIncCombiner::adjustGain(const MachineDominatorTree &MDT) { + const auto *PtrAdd = getPtrInc(); + + auto InputPtrIdx = PtrModSupport.getInputPtrIdx(*PtrAdd); + assert(InputPtrIdx); + + // Input pointer may be copied in later usages, penalize post-inc gain + Gain.GainVector[2] = 0; + if (UserIntrinsic) + /// prioritize user intrinsics + Gain.setPtrMod(2); + + if (PtrAdd->getOpcode() != TargetOpcode::G_PTR_ADD) + return; + + std::optional ImmOffset = getImm(*PtrAdd, *MRI); + if (!ImmOffset) + return; + + if (!TII->isOffsetInImmediateRange( + getCombinedOpCode(), getLoadStoreSize(*CombinerData.CombineRoot), + ImmOffset)) { + ImmInRegs.push_back(*ImmOffset); + LLVM_DEBUG(dbgs() << "no valid imm range!\n"); + Gain.setValidImm(false); + } +} + +std::vector +PostIncCombiner::getPtrInstrs(MachineInstr *CombineRoot) const { + assert(isCombineRootCandidate(CombineRoot)); + std::vector ResultInstr; + // Load/Store have the input pointer at operand[1] + const int PtrIdx = 1; + Register PtrReg = CombineRoot->getOperand(PtrIdx).getReg(); + for (auto &Use : MRI->use_nodbg_instructions(PtrReg)) { + if (isCombineCandidate(*CombineRoot, Use)) + ResultInstr.push_back(&Use); + } + + return ResultInstr; +} + +std::optional PostIncCombiner::getOpCode(MachineInstr *PtrInc, + MachineInstr *MemI) const { + assert(TII); + return TII->getCombinedPostIncOpcode( + *MemI, *PtrInc, + MRI->getType(getMemI()->getOperand(0).getReg()).getSizeInBits()); +} + +// -------------------------- PtrModGain -------------------------------------// + +void PtrModGain::setPtrMod(const int Value) { GainVector[0] = Value; } + +void PtrModGain::setValidImm(const bool ValidImm) { + GainVector[1] = ValidImm ? 1 : 0; +} + +void PtrModGain::setNoCopy(const bool NoCopy) { + GainVector[2] = NoCopy ? 
1 : 0; +} + +} // namespace llvm::AIE diff --git a/llvm/lib/Target/AIE/AIEGlobalCombinerPtrMods.h b/llvm/lib/Target/AIE/AIEGlobalCombinerPtrMods.h new file mode 100644 index 000000000000..f4d0612a32a5 --- /dev/null +++ b/llvm/lib/Target/AIE/AIEGlobalCombinerPtrMods.h @@ -0,0 +1,183 @@ +//===--- AIEGlobalCombinerPtrMods.h - Global Pointer Modifier combiner ----===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates +// +//===----------------------------------------------------------------------===// +// +// Define Pre-Increment (Offset) and Post-Increment Combiners for the global +// combiner search. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AIE_AIEGLOBALCOMBINERPTRMODS_H +#define LLVM_LIB_TARGET_AIE_AIEGLOBALCOMBINERPTRMODS_H +#include "AIEGlobalCombiner.h" +namespace llvm::AIE { + +struct UsageCount { + /// How many Ptr Modifier Instructions were encountered (i.e. read and write + /// Instruction count) + int PtrModCount = 0; + /// How many non-Pointer-Modifier Instructions were encountered (i.e. read + /// only Instruction count) + int NonPtrModCount = 0; +}; + +class PtrModGain : public CombinerGain { +public: + PtrModGain() : CombinerGain() {} + PtrModGain(std::initializer_list InitialGain) + : CombinerGain(InitialGain) {} + PtrModGain(const PtrModGain &Other) = default; + + ~PtrModGain() = default; + + /// Set \p Value to PtrMod Position in Gain + void setPtrMod(const int Value); + /// Set Valid Imm Value based on \p ValidImm . + void setValidImm(const bool ValidImm); + /// Set NoCopy Value based on \p NoCopy . 
+ void setNoCopy(const bool NoCopy); +}; + +class PointerModifierCombiner : public GenericCombiner { +protected: + PtrModGain Gain; + + void setupCombineInstrs(std::vector CombineInstrs, + const AIE::DataDependenceHelper *DAG); + + bool hasOverlapPenalty(const GenericCombiner *Combiner) const; + + CombinerGain + getOverlapPenalty(const GenericCombiner *Combiner) const override; + + CombinerGain getImmediateReuseGain( + const std::vector &UsedImmediates) const override; + + bool canMove(const SUnit *Candidate, const bool MoveDown) const override; + + virtual std::optional getOpCode(MachineInstr *PtrInc, + MachineInstr *MemI) const = 0; + +public: + using GenericCombiner::GenericCombiner; + const AIEBaseInstrInfo::PTRModSupport &PtrModSupport; + const MachineRegisterInfo *MRI = nullptr; + const AIEBaseInstrInfo *TII = nullptr; + const AIE::DataDependenceHelper *DAG = nullptr; + bool RemovePtrMod = false; + bool ReplacePtrModInstr = false; + + PointerModifierCombiner(bool RemoveInstr, bool ReplaceInputPtr, + const MachineRegisterInfo *MRI, + const AIEBaseInstrInfo *TII, StringRef Name) + : GenericCombiner(Name), Gain({1, 1, 1}), + PtrModSupport(TII->getPTRModSupport()), MRI(MRI), TII(TII), + RemovePtrMod(RemoveInstr), ReplacePtrModInstr(ReplaceInputPtr) {} + + ~PointerModifierCombiner() = default; + + const MachineInstr *getPtrInc() const; + MachineInstr *getPtrInc(); + + MachineInstr *getMemI() { return CombinerData.CombineInstrs[1]; } + const MachineInstr *getMemI() const { return CombinerData.CombineInstrs[1]; } + + /// \return the Gain that applying the combiner would incurr + const PtrModGain &getGain() const override; + + void setInsertionPoint() override; + + bool setupCombiner(std::vector CombineInstrs, + const AIE::DataDependenceHelper *DAG) override; + + virtual std::vector getPtrInstrs(MachineInstr *MI) const = 0; + + /// \return Input and Output Pointer Registers + const std::vector getClusterRegs() const override; + + virtual bool 
isCombineCandidate(MachineInstr &CombineRoot, + MachineInstr &Candidate) const = 0; + + std::vector + getCombineCandidates(MachineInstr *MemI, + const AIE::DataDependenceHelper &DAG) const override; + + /// \return Usage Count of \p Addr non debug users after the Combiners' + /// Insertion Point + UsageCount getUsageCount(Register Addr, + const MachineDominatorTree &MDT) const; + + /// \return whether \p MI is a Memory Instruction + bool isCombineRootCandidate(const MachineInstr *MI) const override; + + /// \return whether Opcode can be set + bool tryToSetCombinedOpCode() override; +}; + +class OffsetCombiner : public PointerModifierCombiner { +protected: + std::optional getOpCode(MachineInstr *PtrInc, + MachineInstr *MemI) const override; + +public: + using PointerModifierCombiner::PointerModifierCombiner; + OffsetCombiner(const MachineRegisterInfo *MRI, const AIEBaseInstrInfo *TII) + : PointerModifierCombiner(false, false, MRI, TII, "Offset") {} + + bool isCombineCandidate(MachineInstr &CombineRoot, + MachineInstr &Candidate) const override; + + std::vector getPtrInstrs(MachineInstr *MI) const override; + + std::unique_ptr clone() const override; + + void adjustGain(const MachineDominatorTree &MDT) override; + + std::optional, std::vector>> + getInstructionsToMove(const AIE::DataDependenceHelper &DAG) override; +}; + +class PostIncCombiner : public PointerModifierCombiner { + + bool isPostIncCandidate(const MachineInstr *PtrMod, + const MachineRegisterInfo &MRI) const; + + bool UserIntrinsic = false; + +protected: + std::optional getOpCode(MachineInstr *PtrInc, + MachineInstr *MemI) const override; + +public: + using PointerModifierCombiner::PointerModifierCombiner; + + PostIncCombiner(const MachineRegisterInfo *MRI, const AIEBaseInstrInfo *TII) + : PointerModifierCombiner(true, true, MRI, TII, /*Name=*/"PostInc") {} + + // Constructor for derived Classes + PostIncCombiner(bool ReplaceInstr, const MachineRegisterInfo *MRI, + const AIEBaseInstrInfo *TII, StringRef 
Name) + : PointerModifierCombiner(ReplaceInstr, ReplaceInstr, MRI, TII, Name) {} + + bool isCombineCandidate(MachineInstr &CombineRoot, + MachineInstr &Candidate) const override; + + bool setupCombiner(std::vector CombineInstrs, + const AIE::DataDependenceHelper *DAG) override; + + std::unique_ptr clone() const override; + + void adjustGain(const MachineDominatorTree &MDT) override; + + std::vector getPtrInstrs(MachineInstr *MI) const override; +}; + +} // namespace llvm::AIE + +#endif // LLVM_LIB_TARGET_AIE_AIEGLOBALCOMBINERPTRMODS_H diff --git a/llvm/lib/Target/AIE/AIEInterBlockScheduling.h b/llvm/lib/Target/AIE/AIEInterBlockScheduling.h index 7bcd6795ce87..7e3cdee10ba0 100644 --- a/llvm/lib/Target/AIE/AIEInterBlockScheduling.h +++ b/llvm/lib/Target/AIE/AIEInterBlockScheduling.h @@ -44,8 +44,12 @@ class DataDependenceHelper : public ScheduleDAGInstrs { void schedule() override{}; public: - DataDependenceHelper(const MachineSchedContext &Context) + DataDependenceHelper(const MachineSchedContext &Context, + bool AddMutators = true) : ScheduleDAGInstrs(*Context.MF, Context.MLI), Context(Context) { + if (!AddMutators) + return; + auto &Subtarget = Context.MF->getSubtarget(); auto TT = Subtarget.getTargetTriple(); for (auto &M : AIEBaseSubtarget::getInterBlockMutationsImpl(TT)) { diff --git a/llvm/lib/Target/AIE/AIEPtrModOptimizer.cpp b/llvm/lib/Target/AIE/AIEPtrModOptimizer.cpp index dbd245a216f9..029b6a4688df 100644 --- a/llvm/lib/Target/AIE/AIEPtrModOptimizer.cpp +++ b/llvm/lib/Target/AIE/AIEPtrModOptimizer.cpp @@ -17,6 +17,7 @@ #include "AIE.h" #include "AIEBaseInstrInfo.h" #include "AIEGlobalCombiner.h" +#include "AIEGlobalCombinerPtrMods.h" #include "AIEInterBlockScheduling.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/GlobalISel/CSEInfo.h" @@ -45,7 +46,45 @@ namespace llvm { bool AIEPtrModOptimizer::runOnMachineFunction(MachineFunction &MF) { PtrModRes = std::make_unique( - /*Analysis=*/false); + /*Analysis=*/true); + + MachineRegisterInfo 
&MRI = MF.getRegInfo(); + const auto *TII = + static_cast(MF.getSubtarget().getInstrInfo()); + + const MachineDominatorTree *MDT = &getAnalysis(); + + MachineSchedContext Context; + Context.MF = &MF; + Context.AA = &getAnalysis().getAAResults(); + + // To build the edges in the DAG, the reserved Registers have to be frozen + MRI.freezeReservedRegs(); + AIE::DataDependenceHelper DAG(Context, /*AddMutators=*/false); + + // Fixme: these combiners should be provided by tablegen + std::vector Combiners; + auto OffsetCombiner = std::make_unique(&MRI, TII); + Combiners.push_back(OffsetCombiner.get()); + auto PostInc = std::make_unique(&MRI, TII); + Combiners.push_back(PostInc.get()); + AIE::AIEGlobalCombiner GlobalCombinerHelper(Combiners, *MDT, DAG, &MRI, TII); + + for (auto &MBB : MF) { + LLVM_DEBUG(dbgs() << "\n\n\n New MBB:" << MBB.getName() << " (" + << MBB.getParent()->getName() << ")\n\n"); + + std::vector FoundCombiners = + GlobalCombinerHelper.getCombiners(MBB); + + if (FoundCombiners.empty()) { + LLVM_DEBUG(dbgs() << "[Global Ptr Inc] Skipping. 
No Combiners found!\n"); + continue; + } + + LLVM_DEBUG(dbgs() << "\n[Solution] MBB : " << MBB.getName() << "\n"); + appendResult(FoundCombiners); + } return false; } diff --git a/llvm/lib/Target/AIE/AIETargetMachine.cpp b/llvm/lib/Target/AIE/AIETargetMachine.cpp index 75227f557e1e..1b7bd1967a79 100644 --- a/llvm/lib/Target/AIE/AIETargetMachine.cpp +++ b/llvm/lib/Target/AIE/AIETargetMachine.cpp @@ -59,6 +59,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAIETarget() { initializeAIEAddressSpaceFlatteningPass(*PR); initializeAIEEliminateDuplicatePHIPass(*PR); initializeAIEClusterBaseAddressPass(*PR); + initializeAIEPtrModOptimizerPass(*PR); initializeAIE2PreLegalizerCombinerPass(*PR); initializeAIE2PPreLegalizerCombinerPass(*PR); initializeAIE2PostLegalizerGenericCombinerPass(*PR); diff --git a/llvm/lib/Target/AIE/CMakeLists.txt b/llvm/lib/Target/AIE/CMakeLists.txt index f69d955fd6a9..2b286706530d 100644 --- a/llvm/lib/Target/AIE/CMakeLists.txt +++ b/llvm/lib/Target/AIE/CMakeLists.txt @@ -99,6 +99,7 @@ add_llvm_target(AIECodeGen AIEFinalizeBundle.cpp AIEFrameLowering.cpp AIEGlobalCombiner.cpp + AIEGlobalCombinerPtrMods.cpp AIEHazardRecognizer.cpp AIEHazardRecognizerPRAS.cpp AIEInstrInfo.cpp diff --git a/llvm/lib/Target/AIE/aie2p/AIE2PPostLegalizerCustomCombiner.cpp b/llvm/lib/Target/AIE/aie2p/AIE2PPostLegalizerCustomCombiner.cpp index 85412f68db7d..1da355716a53 100644 --- a/llvm/lib/Target/AIE/aie2p/AIE2PPostLegalizerCustomCombiner.cpp +++ b/llvm/lib/Target/AIE/aie2p/AIE2PPostLegalizerCustomCombiner.cpp @@ -32,6 +32,8 @@ using namespace llvm; +extern cl::opt EnableGlobalPtrModOptimizer; + static const char AIE2P_POSTLEGALIZER_CUSTOM_COMBINER[] = "AIE2P Post Legalizer Custom Combiner"; @@ -112,6 +114,9 @@ class AIE2PPostLegalizerCustomCombiner : public MachineFunctionPass { AU.addPreserved(); AU.addRequired(); AU.addPreserved(); + if (EnableGlobalPtrModOptimizer) { + AU.addRequired(); + } MachineFunctionPass::getAnalysisUsage(AU); } diff --git 
a/llvm/test/CodeGen/AIE/GlobalISel/addrspace-before-selection.ll b/llvm/test/CodeGen/AIE/GlobalISel/addrspace-before-selection.ll index 32d7c8a3b094..71bdbf3af29a 100644 --- a/llvm/test/CodeGen/AIE/GlobalISel/addrspace-before-selection.ll +++ b/llvm/test/CodeGen/AIE/GlobalISel/addrspace-before-selection.ll @@ -4,23 +4,23 @@ ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception ; ; (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates -; RUN: llc -mtriple=aie2 -stop-before=instruction-select %s -o - 2>&1 | FileCheck %s +; RUN: llc -mtriple=aie2 -stop-after=instruction-select %s -o - 2>&1 | FileCheck %s ; Test if addrspace is correctly propagated after transformations, like memory op. ; split. +; Note: The Global Combiner does not legalize non-legal loads/stores; therefore, +; this unit test has to stop after instruction selection to show the legalized load instructions. define dso_local noundef<16 x i32> @addrspace_propagation(ptr addrspace(6) nocapture readonly %ptr) local_unnamed_addr #0 { ; CHECK-LABEL: name: addrspace_propagation ; CHECK: bb.1.entry: ; CHECK-NEXT: liveins: $p0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:ptrregbank(p0) = COPY $p0 - ; CHECK-NEXT: [[C:%[0-9]+]]:modregbank(s20) = G_CONSTANT i20 128 - ; CHECK-NEXT: [[C1:%[0-9]+]]:modregbank(s20) = G_CONSTANT i20 160 - ; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:vregbank(<8 x s32>) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C1]](s20) :: (load (<8 x s32>) from %ir.arrayidx.1 + 32, addrspace 6) - ; CHECK-NEXT: [[AIE_OFFSET_LOAD1:%[0-9]+]]:vregbank(<8 x s32>) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C]](s20) :: (load (<8 x s32>) from %ir.arrayidx.1, addrspace 6) - ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vregbank(<16 x s32>) = G_CONCAT_VECTORS [[AIE_OFFSET_LOAD1]](<8 x s32>), [[AIE_OFFSET_LOAD]](<8 x s32>) - ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 + ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm:%[0-9]+]]:vec256 = VLDA_dmw_lda_w_ag_idx_imm 
[[COPY]], 160 :: (load (<8 x s32>) from %ir.arrayidx.1 + 32, addrspace 6) + ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm1:%[0-9]+]]:vec256 = VLDA_dmw_lda_w_ag_idx_imm [[COPY]], 128 :: (load (<8 x s32>) from %ir.arrayidx.1, addrspace 6) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLDA_dmw_lda_w_ag_idx_imm1]], %subreg.sub_256_lo, [[VLDA_dmw_lda_w_ag_idx_imm]], %subreg.sub_256_hi + ; CHECK-NEXT: $x0 = COPY [[REG_SEQUENCE]] ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 entry: %arrayidx.1 = getelementptr inbounds [16 x <16 x i32>], ptr addrspace(6) %ptr, i32 0, i32 2 diff --git a/llvm/test/CodeGen/AIE/GlobalISel/combine-loads-stores.mir b/llvm/test/CodeGen/AIE/GlobalISel/combine-loads-stores.mir index 866bf5ed4751..750641bbcc03 100644 --- a/llvm/test/CodeGen/AIE/GlobalISel/combine-loads-stores.mir +++ b/llvm/test/CodeGen/AIE/GlobalISel/combine-loads-stores.mir @@ -43,8 +43,9 @@ body: | ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s20) = G_TRUNC [[C]](s32) ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[TRUNC]](s20) - ; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:_(s32) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[TRUNC]](s20) :: (load (s32)) - ; CHECK-NEXT: $r0 = COPY [[AIE_OFFSET_LOAD]](s32) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY [[PTR_ADD]](p0) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY1]](p0) :: (load (s32)) + ; CHECK-NEXT: $r0 = COPY [[LOAD]](s32) ; CHECK-NEXT: $p0 = COPY [[PTR_ADD]](p0) %0:_(p0) = COPY $p0 %1:_(s32) = G_CONSTANT i32 24 @@ -161,9 +162,7 @@ body: | ... -# Our current combine code is not able to move the memory operation up. In this -# case we cannot just move the pointer add to the load and we therefore don't -# combine. This could be improved. +# Move PTR_ADD Instruction down to the Memory Instruction. 
--- name: load_not_to_postinc_ptradd_before_load body: | @@ -175,10 +174,9 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s20) = G_TRUNC [[C]](s32) - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[TRUNC]](s20) - ; CHECK-NEXT: $p0 = COPY [[PTR_ADD]](p0) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32)) - ; CHECK-NEXT: $r0 = COPY [[LOAD]](s32) + ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[TRUNC]](s20) :: (load (s32)) + ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_LOAD1]](p0) + ; CHECK-NEXT: $r0 = COPY [[AIE_POSTINC_LOAD]](s32) %0:_(p0) = COPY $p0 %1:_(s32) = G_CONSTANT i32 24 %2:_(s20) = G_TRUNC %1 @@ -220,12 +218,11 @@ body: | ; CHECK: liveins: $p0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32)) - ; CHECK-NEXT: $r0 = COPY [[LOAD]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s20) = G_TRUNC [[C]](s32) - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[TRUNC]](s20) - ; CHECK-NEXT: $p0 = COPY [[PTR_ADD]](p0) + ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[TRUNC]](s20) :: (load (s32)) + ; CHECK-NEXT: $r0 = COPY [[AIE_POSTINC_LOAD]](s32) + ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_LOAD1]](p0) %0:_(p0) = COPY $p0 %3:_(s32) = G_LOAD %0 :: (load (s32)) $r0 = COPY %3 @@ -352,9 +349,7 @@ body: | $p0 = COPY %3 ... -# Our current combine code is not able to move the memory operation up. In this -# case we cannot just move the pointer add to the store and we therefore don't -# combine. This could be improved. +# Move PTR_ADD Instruction down to the Memory Instruction. 
--- name: store_not_to_postinc_ptr_add_before body: | @@ -366,9 +361,8 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $r0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 24 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s20) - ; CHECK-NEXT: $p0 = COPY [[PTR_ADD]](p0) - ; CHECK-NEXT: G_STORE [[COPY1]](s32), [[COPY]](p0) :: (store (s32)) + ; CHECK-NEXT: [[AIE_POSTINC_STORE:%[0-9]+]]:_(p0) = G_AIE_POSTINC_STORE [[COPY1]](s32), [[COPY]], [[C]](s20) :: (store (s32)) + ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_STORE]](p0) %0:_(p0) = COPY $p0 %1:_(s32) = COPY $r0 %2:_(s20) = G_CONSTANT i20 24 @@ -430,13 +424,12 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $r0 - ; CHECK-NEXT: G_STORE [[COPY1]](s32), [[COPY]](p0) :: (store (s32)) - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY $p1 - ; CHECK-NEXT: G_STORE [[COPY1]](s32), [[COPY2]](p0) :: (store (s32)) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s20) = G_TRUNC [[C]](s32) - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[TRUNC]](s20) - ; CHECK-NEXT: $p0 = COPY [[PTR_ADD]](p0) + ; CHECK-NEXT: [[AIE_POSTINC_STORE:%[0-9]+]]:_(p0) = G_AIE_POSTINC_STORE [[COPY1]](s32), [[COPY]], [[TRUNC]](s20) :: (store (s32)) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY $p1 + ; CHECK-NEXT: G_STORE [[COPY1]](s32), [[COPY2]](p0) :: (store (s32)) + ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_STORE]](p0) %0:_(p0) = COPY $p0 %1:_(s32) = COPY $r0 G_STORE %1, %0 :: (store (s32)) @@ -618,10 +611,9 @@ body: | ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s20) = G_TRUNC [[C]](s32) ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[TRUNC]](s20) - ; CHECK-NEXT: $p0 = COPY [[PTR_ADD]](p0) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD 
[[COPY]](p0) :: (load (s32)) - ; CHECK-NEXT: $r0 = COPY [[LOAD]](s32) + ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[TRUNC]](s20) :: (load (s32)) + ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_LOAD1]](p0) + ; CHECK-NEXT: $r0 = COPY [[AIE_POSTINC_LOAD]](s32) %1:_(s32) = G_CONSTANT i32 24 %4:_(s20) = G_TRUNC %1 %0:_(p0) = COPY $p0 @@ -639,12 +631,12 @@ body: | ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s20) = G_TRUNC [[C]](s32) - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[TRUNC]](s20) + ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[TRUNC]](s20) :: (load (s32)) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s20) = G_TRUNC [[C1]](s32) - ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[TRUNC1]](s20) :: (load (s32)) - ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_LOAD1]](p0) - ; CHECK-NEXT: $p1 = COPY [[PTR_ADD]](p0) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[TRUNC1]](s20) + ; CHECK-NEXT: $p0 = COPY [[PTR_ADD]](p0) + ; CHECK-NEXT: $p1 = COPY [[AIE_POSTINC_LOAD1]](p0) ; CHECK-NEXT: $r0 = COPY [[AIE_POSTINC_LOAD]](s32) %2:_(p0) = COPY $p0 %7:_(s32) = G_LOAD %2 :: (load (s32)) @@ -931,15 +923,10 @@ body: | ; CHECK-LABEL: name: preinc_combine_vectors_512_bits ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 - ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 96 - ; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:_(<16 x s16>) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C1]](s20) :: (load (<16 x s16>) from unknown-address + 32) - ; CHECK-NEXT: [[AIE_OFFSET_LOAD1:%[0-9]+]]:_(<16 x s16>) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C]](s20) :: (load (<16 x s16>), align 64) + 
; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:_(<32 x s16>) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C]](s20) :: (load (<32 x s16>)) ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p0 - ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<16 x s16>) = COPY [[AIE_OFFSET_LOAD1]](<16 x s16>) - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(<16 x s16>) = COPY [[AIE_OFFSET_LOAD]](<16 x s16>) - ; CHECK-NEXT: G_AIE_OFFSET_STORE [[COPY3]](<16 x s16>), [[COPY1]](p0), [[C1]](s20) :: (store (<16 x s16>) into unknown-address + 32) - ; CHECK-NEXT: G_AIE_OFFSET_STORE [[COPY2]](<16 x s16>), [[COPY1]](p0), [[C2]](s20) :: (store (<16 x s16>), align 64) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 + ; CHECK-NEXT: G_AIE_OFFSET_STORE [[AIE_OFFSET_LOAD]](<32 x s16>), [[COPY1]](p0), [[C1]](s20) :: (store (<32 x s16>)) %0:_(p0) = COPY $p0 %1:_(s20) = G_CONSTANT i20 64 %3:_(p0) = G_PTR_ADD %0, %1 @@ -984,13 +971,12 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32)) - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s20) - ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s20) - ; CHECK-NEXT: $r0 = COPY [[LOAD]](s32) + ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[C]](s20) :: (load (s32)) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s20) + ; CHECK-NEXT: $r0 = COPY [[AIE_POSTINC_LOAD]](s32) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[PTR_ADD]](p0), implicit [[PTR_ADD1]](p0), implicit [[COPY]](p0) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[AIE_POSTINC_LOAD1]](p0), implicit [[PTR_ADD]](p0), implicit [[COPY]](p0) bb.0: %0:_(p0) = COPY $p0 %1:_(s20) = G_CONSTANT i20 64 @@ -1015,9 
+1001,8 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32)) - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s20) - ; CHECK-NEXT: $r0 = COPY [[LOAD]](s32) + ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[C]](s20) :: (load (s32)) + ; CHECK-NEXT: $r0 = COPY [[AIE_POSTINC_LOAD]](s32) ; CHECK-NEXT: PseudoJNZ $r1, %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: @@ -1032,7 +1017,7 @@ body: | ; CHECK-NEXT: PseudoJ_jump_imm %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[PTR_ADD]](p0) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[AIE_POSTINC_LOAD1]](p0) bb.0: %0:_(p0) = COPY $p0 %1:_(s20) = G_CONSTANT i20 64 @@ -1061,9 +1046,8 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32)) - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s20) - ; CHECK-NEXT: $r0 = COPY [[LOAD]](s32) + ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[C]](s20) :: (load (s32)) + ; CHECK-NEXT: $r0 = COPY [[AIE_POSTINC_LOAD]](s32) ; CHECK-NEXT: PseudoJNZ $r1, %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: @@ -1078,7 +1062,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: $p0 = COPY [[COPY]](p0) - ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[PTR_ADD]](p0) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[AIE_POSTINC_LOAD1]](p0) bb.0: %0:_(p0) = COPY $p0 %1:_(s20) = G_CONSTANT i20 64 @@ -1160,9 +1144,8 @@ body: | ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 - ; 
CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32)) - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s20) - ; CHECK-NEXT: $r0 = COPY [[LOAD]](s32) + ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[C]](s20) :: (load (s32)) + ; CHECK-NEXT: $r0 = COPY [[AIE_POSTINC_LOAD]](s32) ; CHECK-NEXT: PseudoJ_jump_imm %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: @@ -1172,7 +1155,7 @@ body: | ; CHECK-NEXT: PseudoJ_jump_imm %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[PTR_ADD]](p0) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[AIE_POSTINC_LOAD1]](p0) bb.0: %0:_(p0) = COPY $p0 PseudoJNZ $r1, %bb.2 @@ -1206,9 +1189,8 @@ body: | ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32)) - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s20) - ; CHECK-NEXT: $r0 = COPY [[LOAD]](s32) + ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[C]](s20) :: (load (s32)) + ; CHECK-NEXT: $r0 = COPY [[AIE_POSTINC_LOAD]](s32) ; CHECK-NEXT: PseudoJ_jump_imm %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: @@ -1218,7 +1200,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: $p0 = COPY [[COPY]](p0) - ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[PTR_ADD]](p0) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[AIE_POSTINC_LOAD1]](p0) bb.0: %0:_(p0) = COPY $p0 PseudoJNZ $r1, %bb.2 @@ -1309,10 +1291,9 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32)) - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s20) - ; CHECK-NEXT: $r0 = COPY 
[[LOAD]](s32) - ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[PTR_ADD]](p0) + ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[C]](s20) :: (load (s32)) + ; CHECK-NEXT: $r0 = COPY [[AIE_POSTINC_LOAD]](s32) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[AIE_POSTINC_LOAD1]](p0) bb.0: %0:_(p0) = COPY $p0 PseudoJNZ $r1, %bb.2 @@ -1342,11 +1323,11 @@ body: | ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(<32 x s8>), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[C]](s20) :: (load (<32 x s8>)) + ; CHECK-NEXT: [[AIE_POSTINC_STORE:%[0-9]+]]:_(p0) = G_AIE_POSTINC_STORE [[AIE_POSTINC_LOAD]](<32 x s8>), [[COPY1]], [[C]](s20) :: (store (<32 x s8>)) ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_LOAD1]](p0) - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY1]], [[C]](s20) - ; CHECK-NEXT: [[AIE_POSTINC_STORE:%[0-9]+]]:_(p0) = G_AIE_POSTINC_STORE [[AIE_POSTINC_LOAD]](<32 x s8>), [[COPY1]], [[C1]](s20) :: (store (<32 x s8>)) - ; CHECK-NEXT: $p1 = COPY [[PTR_ADD]](p0) - ; CHECK-NEXT: $p2 = COPY [[AIE_POSTINC_STORE]](p0) + ; CHECK-NEXT: $p1 = COPY [[AIE_POSTINC_STORE]](p0) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY1]], [[C1]](s20) + ; CHECK-NEXT: $p2 = COPY [[PTR_ADD]](p0) ; CHECK-NEXT: $wl0 = COPY [[AIE_POSTINC_LOAD]](<32 x s8>) %0:_(p0) = COPY $p0 %6:_(p0) = COPY $p1 @@ -1372,11 +1353,11 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p1 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 ; CHECK-NEXT: [[AIE_POSTINC_2D_LOAD:%[0-9]+]]:_(<32 x s8>), [[AIE_POSTINC_2D_LOAD1:%[0-9]+]]:_(p0), [[AIE_POSTINC_2D_LOAD2:%[0-9]+]]:_(s20) = G_AIE_POSTINC_2D_LOAD [[COPY]], [[C]], [[C]], [[C]], [[C]] :: (load (<32 x s8>)) + ; CHECK-NEXT: [[AIE_POSTINC_2D_STORE:%[0-9]+]]:_(p0), [[AIE_POSTINC_2D_STORE1:%[0-9]+]]:_(s20) = G_AIE_POSTINC_2D_STORE 
[[AIE_POSTINC_2D_LOAD]](<32 x s8>), [[COPY1]], [[C]], [[C]], [[C]], [[C]] :: (store (<32 x s8>)) ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_2D_LOAD1]](p0) + ; CHECK-NEXT: $p1 = COPY [[AIE_POSTINC_2D_STORE]](p0) ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p0), [[INT1:%[0-9]+]]:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2.add.2d), [[COPY1]](p0), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20) - ; CHECK-NEXT: [[AIE_POSTINC_2D_STORE:%[0-9]+]]:_(p0), [[AIE_POSTINC_2D_STORE1:%[0-9]+]]:_(s20) = G_AIE_POSTINC_2D_STORE [[AIE_POSTINC_2D_LOAD]](<32 x s8>), [[COPY1]], [[C]], [[C]], [[C]], [[C]] :: (store (<32 x s8>)) - ; CHECK-NEXT: $p1 = COPY [[INT]](p0) - ; CHECK-NEXT: $p2 = COPY [[AIE_POSTINC_2D_STORE]](p0) + ; CHECK-NEXT: $p2 = COPY [[INT]](p0) ; CHECK-NEXT: $wl0 = COPY [[AIE_POSTINC_2D_LOAD]](<32 x s8>) %0:_(p0) = COPY $p0 %6:_(p0) = COPY $p1 @@ -1401,11 +1382,11 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p1 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 ; CHECK-NEXT: [[AIE_POSTINC_3D_LOAD:%[0-9]+]]:_(<32 x s8>), [[AIE_POSTINC_3D_LOAD1:%[0-9]+]]:_(p0), [[AIE_POSTINC_3D_LOAD2:%[0-9]+]]:_(s20), [[AIE_POSTINC_3D_LOAD3:%[0-9]+]]:_ = G_AIE_POSTINC_3D_LOAD [[COPY]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (load (<32 x s8>)) + ; CHECK-NEXT: [[AIE_POSTINC_3D_STORE:%[0-9]+]]:_(p0), [[AIE_POSTINC_3D_STORE1:%[0-9]+]]:_(s20), [[AIE_POSTINC_3D_STORE2:%[0-9]+]]:_ = G_AIE_POSTINC_3D_STORE [[AIE_POSTINC_3D_LOAD]](<32 x s8>), [[COPY1]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (store (<32 x s8>)) ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_3D_LOAD1]](p0) + ; CHECK-NEXT: $p1 = COPY [[AIE_POSTINC_3D_STORE]](p0) ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p0), [[INT1:%[0-9]+]]:_(s20), [[INT2:%[0-9]+]]:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2.add.3d), [[COPY1]](p0), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20) - ; CHECK-NEXT: [[AIE_POSTINC_3D_STORE:%[0-9]+]]:_(p0), [[AIE_POSTINC_3D_STORE1:%[0-9]+]]:_(s20), [[AIE_POSTINC_3D_STORE2:%[0-9]+]]:_ = 
G_AIE_POSTINC_3D_STORE [[AIE_POSTINC_3D_LOAD]](<32 x s8>), [[COPY1]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (store (<32 x s8>)) - ; CHECK-NEXT: $p1 = COPY [[INT]](p0) - ; CHECK-NEXT: $p2 = COPY [[AIE_POSTINC_3D_STORE]](p0) + ; CHECK-NEXT: $p2 = COPY [[INT]](p0) ; CHECK-NEXT: $wl0 = COPY [[AIE_POSTINC_3D_LOAD]](<32 x s8>) %0:_(p0) = COPY $p0 %6:_(p0) = COPY $p1 @@ -1457,17 +1438,12 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p1 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 - ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s20) = G_CONSTANT i20 32 - ; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:_(<16 x s16>) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C2]](s20) :: (load (<16 x s16>) from unknown-address + 32) - ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(<16 x s16>), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[C]](s20) :: (load (<16 x s16>), align 64) - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<16 x s16>) = COPY [[AIE_POSTINC_LOAD]](<16 x s16>) - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(<16 x s16>) = COPY [[AIE_OFFSET_LOAD]](<16 x s16>) - ; CHECK-NEXT: G_AIE_OFFSET_STORE [[COPY3]](<16 x s16>), [[COPY1]](p0), [[C2]](s20) :: (store (<16 x s16>) into unknown-address + 32) + ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(<32 x s16>), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[C]](s20) :: (load (<32 x s16>)) + ; CHECK-NEXT: [[AIE_POSTINC_STORE:%[0-9]+]]:_(p0) = G_AIE_POSTINC_STORE [[AIE_POSTINC_LOAD]](<32 x s16>), [[COPY1]], [[C]](s20) :: (store (<32 x s16>)) ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_LOAD1]](p0) - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY1]], [[C]](s20) - ; CHECK-NEXT: [[AIE_POSTINC_STORE:%[0-9]+]]:_(p0) = G_AIE_POSTINC_STORE [[COPY2]](<16 x s16>), [[COPY1]], [[C1]](s20) :: (store (<16 x s16>), align 64) - ; CHECK-NEXT: $p1 = COPY [[PTR_ADD]](p0) - ; CHECK-NEXT: $p2 = COPY [[AIE_POSTINC_STORE]](p0) + ; CHECK-NEXT: $p1 = COPY 
[[AIE_POSTINC_STORE]](p0) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY1]], [[C1]](s20) + ; CHECK-NEXT: $p2 = COPY [[PTR_ADD]](p0) %0:_(p0) = COPY $p0 %6:_(p0) = COPY $p1 %1:_(s20) = G_CONSTANT i20 64 @@ -1490,13 +1466,8 @@ body: | ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p1 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 - ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 32 - ; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:_(<16 x s16>) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C1]](s20) :: (load (<16 x s16>) from unknown-address + 32) - ; CHECK-NEXT: [[AIE_POSTINC_2D_LOAD:%[0-9]+]]:_(<16 x s16>), [[AIE_POSTINC_2D_LOAD1:%[0-9]+]]:_(p0), [[AIE_POSTINC_2D_LOAD2:%[0-9]+]]:_(s20) = G_AIE_POSTINC_2D_LOAD [[COPY]], [[C]], [[C]], [[C]], [[C]] :: (load (<16 x s16>), align 64) - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<16 x s16>) = COPY [[AIE_POSTINC_2D_LOAD]](<16 x s16>) - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(<16 x s16>) = COPY [[AIE_OFFSET_LOAD]](<16 x s16>) - ; CHECK-NEXT: G_AIE_OFFSET_STORE [[COPY3]](<16 x s16>), [[COPY1]](p0), [[C1]](s20) :: (store (<16 x s16>) into unknown-address + 32) - ; CHECK-NEXT: [[AIE_POSTINC_2D_STORE:%[0-9]+]]:_(p0), [[AIE_POSTINC_2D_STORE1:%[0-9]+]]:_(s20) = G_AIE_POSTINC_2D_STORE [[COPY2]](<16 x s16>), [[COPY1]], [[C]], [[C]], [[C]], [[C]] :: (store (<16 x s16>), align 64) + ; CHECK-NEXT: [[AIE_POSTINC_2D_LOAD:%[0-9]+]]:_(<32 x s16>), [[AIE_POSTINC_2D_LOAD1:%[0-9]+]]:_(p0), [[AIE_POSTINC_2D_LOAD2:%[0-9]+]]:_(s20) = G_AIE_POSTINC_2D_LOAD [[COPY]], [[C]], [[C]], [[C]], [[C]] :: (load (<32 x s16>)) + ; CHECK-NEXT: [[AIE_POSTINC_2D_STORE:%[0-9]+]]:_(p0), [[AIE_POSTINC_2D_STORE1:%[0-9]+]]:_(s20) = G_AIE_POSTINC_2D_STORE [[AIE_POSTINC_2D_LOAD]](<32 x s16>), [[COPY1]], [[C]], [[C]], [[C]], [[C]] :: (store (<32 x s16>)) ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_2D_LOAD1]](p0) ; CHECK-NEXT: $p1 = COPY [[AIE_POSTINC_2D_STORE]](p0) %0:_(p0) = COPY $p0 @@ -1518,13 +1489,8 @@ body: | ; CHECK: 
[[COPY:%[0-9]+]]:_(p0) = COPY $p0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p1 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 - ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 32 - ; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:_(<16 x s16>) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C1]](s20) :: (load (<16 x s16>) from unknown-address + 32) - ; CHECK-NEXT: [[AIE_POSTINC_3D_LOAD:%[0-9]+]]:_(<16 x s16>), [[AIE_POSTINC_3D_LOAD1:%[0-9]+]]:_(p0), [[AIE_POSTINC_3D_LOAD2:%[0-9]+]]:_(s20), [[AIE_POSTINC_3D_LOAD3:%[0-9]+]]:_ = G_AIE_POSTINC_3D_LOAD [[COPY]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (load (<16 x s16>), align 64) - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<16 x s16>) = COPY [[AIE_POSTINC_3D_LOAD]](<16 x s16>) - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(<16 x s16>) = COPY [[AIE_OFFSET_LOAD]](<16 x s16>) - ; CHECK-NEXT: G_AIE_OFFSET_STORE [[COPY3]](<16 x s16>), [[COPY1]](p0), [[C1]](s20) :: (store (<16 x s16>) into unknown-address + 32) - ; CHECK-NEXT: [[AIE_POSTINC_3D_STORE:%[0-9]+]]:_(p0), [[AIE_POSTINC_3D_STORE1:%[0-9]+]]:_(s20), [[AIE_POSTINC_3D_STORE2:%[0-9]+]]:_ = G_AIE_POSTINC_3D_STORE [[COPY2]](<16 x s16>), [[COPY1]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (store (<16 x s16>), align 64) + ; CHECK-NEXT: [[AIE_POSTINC_3D_LOAD:%[0-9]+]]:_(<32 x s16>), [[AIE_POSTINC_3D_LOAD1:%[0-9]+]]:_(p0), [[AIE_POSTINC_3D_LOAD2:%[0-9]+]]:_(s20), [[AIE_POSTINC_3D_LOAD3:%[0-9]+]]:_ = G_AIE_POSTINC_3D_LOAD [[COPY]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (load (<32 x s16>)) + ; CHECK-NEXT: [[AIE_POSTINC_3D_STORE:%[0-9]+]]:_(p0), [[AIE_POSTINC_3D_STORE1:%[0-9]+]]:_(s20), [[AIE_POSTINC_3D_STORE2:%[0-9]+]]:_ = G_AIE_POSTINC_3D_STORE [[AIE_POSTINC_3D_LOAD]](<32 x s16>), [[COPY1]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (store (<32 x s16>)) ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_3D_LOAD1]](p0) ; CHECK-NEXT: $p1 = COPY [[AIE_POSTINC_3D_STORE]](p0) %0:_(p0) = COPY $p0 diff --git a/llvm/test/CodeGen/AIE/GlobalISel/combine-split-large-ls.mir 
b/llvm/test/CodeGen/AIE/GlobalISel/combine-split-large-ls.mir index 1add9805febd..44b10a9ac536 100644 --- a/llvm/test/CodeGen/AIE/GlobalISel/combine-split-large-ls.mir +++ b/llvm/test/CodeGen/AIE/GlobalISel/combine-split-large-ls.mir @@ -8,11 +8,13 @@ # RUN: llc -mtriple aie2 -run-pass=aie2-postlegalizer-custom-combiner %s -verify-machineinstrs -o - | FileCheck %s # Test for combine_load_store_split +# FIXME: Global Combiner does not evaluate if the Load has to be split into smaller loads with offsets. # Case 1: Can split. --- name: load_16xs32 +tracksRegLiveness: true body: | bb.0: liveins: $p0 @@ -21,9 +23,10 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 32 - ; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:_(<8 x s32>) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C]](s20) :: (load (<8 x s32>) from unknown-address + 32) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[COPY]](p0) :: (load (<8 x s32>), align 64) - ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[LOAD]](<8 x s32>), [[AIE_OFFSET_LOAD]](<8 x s32>) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s20) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[PTR_ADD]](p0) :: (load (<8 x s32>) from unknown-address + 32) + ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[COPY]](p0) :: (load (<8 x s32>), align 64) + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[LOAD1]](<8 x s32>), [[LOAD]](<8 x s32>) ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) %1:_(p0) = COPY $p0 %295:_(<16 x s32>) = G_LOAD %1(p0) :: (load (<16 x s32>)) @@ -42,9 +45,10 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 32 - ; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:_(<16 x s16>) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C]](s20) :: (load (<16 x s16>) from unknown-address + 32) - ; CHECK-NEXT: 
[[LOAD:%[0-9]+]]:_(<16 x s16>) = G_LOAD [[COPY]](p0) :: (load (<16 x s16>), align 64) - ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<32 x s16>) = G_CONCAT_VECTORS [[LOAD]](<16 x s16>), [[AIE_OFFSET_LOAD]](<16 x s16>) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s20) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<16 x s16>) = G_LOAD [[PTR_ADD]](p0) :: (load (<16 x s16>) from unknown-address + 32) + ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<16 x s16>) = G_LOAD [[COPY]](p0) :: (load (<16 x s16>), align 64) + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<32 x s16>) = G_CONCAT_VECTORS [[LOAD1]](<16 x s16>), [[LOAD]](<16 x s16>) ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<32 x s16>) %1:_(p0) = COPY $p0 %295:_(<32 x s16>) = G_LOAD %1(p0) :: (load (<32 x s16>)) @@ -63,9 +67,10 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 32 - ; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:_(<32 x s8>) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C]](s20) :: (load (<32 x s8>) from unknown-address + 32) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<32 x s8>) = G_LOAD [[COPY]](p0) :: (load (<32 x s8>), align 64) - ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<64 x s8>) = G_CONCAT_VECTORS [[LOAD]](<32 x s8>), [[AIE_OFFSET_LOAD]](<32 x s8>) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s20) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<32 x s8>) = G_LOAD [[PTR_ADD]](p0) :: (load (<32 x s8>) from unknown-address + 32) + ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<32 x s8>) = G_LOAD [[COPY]](p0) :: (load (<32 x s8>), align 64) + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<64 x s8>) = G_CONCAT_VECTORS [[LOAD1]](<32 x s8>), [[LOAD]](<32 x s8>) ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<64 x s8>) %1:_(p0) = COPY $p0 %295:_(<64 x s8>) = G_LOAD %1(p0) :: (load (<64 x s8>)) @@ -86,7 +91,8 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p0 ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<8 x s32>), [[UV1:%[0-9]+]]:_(<8 x s32>) = 
G_UNMERGE_VALUES [[COPY]](<16 x s32>) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 32 - ; CHECK-NEXT: G_AIE_OFFSET_STORE [[UV1]](<8 x s32>), [[COPY1]](p0), [[C]](s20) :: (store (<8 x s32>) into unknown-address + 32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY1]], [[C]](s20) + ; CHECK-NEXT: G_STORE [[UV1]](<8 x s32>), [[PTR_ADD]](p0) :: (store (<8 x s32>) into unknown-address + 32) ; CHECK-NEXT: G_STORE [[UV]](<8 x s32>), [[COPY1]](p0) :: (store (<8 x s32>), align 64) %0:_(<16 x s32>) = COPY $x0 %1:_(p0) = COPY $p0 diff --git a/llvm/test/CodeGen/AIE/aie2/GlobalISel/indexed-512-load-store.mir b/llvm/test/CodeGen/AIE/aie2/GlobalISel/indexed-512-load-store.mir index 974f2901e8eb..3c4ad2a313f9 100644 --- a/llvm/test/CodeGen/AIE/aie2/GlobalISel/indexed-512-load-store.mir +++ b/llvm/test/CodeGen/AIE/aie2/GlobalISel/indexed-512-load-store.mir @@ -7,13 +7,15 @@ # (c) Copyright 2023-2024 Advanced Micro Devices, Inc. or its affiliates # RUN: llc -mtriple aie2 -start-before=aie2-postlegalizer-custom-combiner -stop-after=instruction-select %s -verify-machineinstrs -o - | FileCheck %s -# The way we currently select 512-bit offset memory operantions when the offset +# The way we currently select 512-bit offset memory operations when the offset # does not fit in the immediate range of the instruction leads to unnecessarily # selecting an identical PADD twice in certain situations. # For simpler tests CSE picks up on these cases and removes one copy of the PADD # further down the compilation pipeline, but in both examples below the # duplicated PADD reaches the final assembly. # TODO: This can and should be avoided! 
+# Hint: global combiners do not analyze legalized load/store instructions, +# therefore, individual offsets cannot be generated for the legalized load/store instructions --- name: load_offset_not_32_step @@ -27,16 +29,15 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ep = COPY $p0 ; CHECK-NEXT: [[MOV_RLC_imm10_pseudo:%[0-9]+]]:er = MOV_RLC_imm10_pseudo 24 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:edj = COPY [[MOV_RLC_imm10_pseudo]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:em = COPY [[COPY1]] - ; CHECK-NEXT: [[PADD_mod_pseudo:%[0-9]+]]:ep = PADD_mod_pseudo [[COPY]], [[COPY2]] - ; CHECK-NEXT: [[MOV_PD_imm10_pseudo:%[0-9]+]]:edj = MOV_PD_imm10_pseudo 56 - ; CHECK-NEXT: [[VLD_idx_pseudo:%[0-9]+]]:ewh = VLD_idx_pseudo [[COPY]], [[MOV_PD_imm10_pseudo]] :: (load (<16 x s16>) from unknown-address + 32) - ; CHECK-NEXT: [[VLD_idx_pseudo1:%[0-9]+]]:ewl = VLD_idx_pseudo [[COPY]], [[COPY1]] :: (load (<16 x s16>), align 64) - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLD_idx_pseudo1]], %subreg.sub_256_lo, [[VLD_idx_pseudo]], %subreg.sub_256_hi - ; CHECK-NEXT: [[MOV_PD_imm10_pseudo1:%[0-9]+]]:edj = MOV_PD_imm10_pseudo 48 - ; CHECK-NEXT: [[VLD_idx_pseudo2:%[0-9]+]]:vec256 = VLD_idx_pseudo [[COPY]], [[MOV_PD_imm10_pseudo1]] :: (load (<16 x s16>)) - ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[PADD_mod_pseudo]], implicit [[REG_SEQUENCE]], implicit [[VLD_idx_pseudo2]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:em = COPY [[MOV_RLC_imm10_pseudo]] + ; CHECK-NEXT: [[PADD_mod_pseudo:%[0-9]+]]:ep = PADD_mod_pseudo [[COPY]], [[COPY1]] + ; CHECK-NEXT: [[PADD_mod_pseudo1:%[0-9]+]]:ep = PADD_mod_pseudo [[COPY]], [[COPY1]] + ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm:%[0-9]+]]:vec256 = VLDA_dmw_lda_w_ag_idx_imm [[PADD_mod_pseudo1]], 32 :: (load (<16 x s16>) from unknown-address + 32) + ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm1:%[0-9]+]]:vec256 = VLDA_dmw_lda_w_ag_idx_imm [[PADD_mod_pseudo1]], 0 :: (load (<16 x s16>), align 64) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec512 = 
REG_SEQUENCE [[VLDA_dmw_lda_w_ag_idx_imm1]], %subreg.sub_256_lo, [[VLDA_dmw_lda_w_ag_idx_imm]], %subreg.sub_256_hi + ; CHECK-NEXT: [[MOV_PD_imm10_pseudo:%[0-9]+]]:edj = MOV_PD_imm10_pseudo 48 + ; CHECK-NEXT: [[VLD_idx_pseudo:%[0-9]+]]:vec256 = VLD_idx_pseudo [[COPY]], [[MOV_PD_imm10_pseudo]] :: (load (<16 x s16>)) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[PADD_mod_pseudo]], implicit [[REG_SEQUENCE]], implicit [[VLD_idx_pseudo]] %0:_(p0) = COPY $p0 %1:_(s32) = G_CONSTANT i32 24 %2:_(s20) = G_TRUNC %1 @@ -61,8 +62,8 @@ body: | ; CHECK-NEXT: [[MOV_RLC_imm10_pseudo:%[0-9]+]]:er = MOV_RLC_imm10_pseudo 64 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:em = COPY [[MOV_RLC_imm10_pseudo]] ; CHECK-NEXT: [[PADD_mod_pseudo:%[0-9]+]]:ep = PADD_mod_pseudo [[COPY]], [[COPY1]] - ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo:%[0-9]+]]:ewh = VLD_idx_imm_3x32_pseudo [[COPY]], 96 :: (load (<16 x s16>) from unknown-address + 32) - ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo1:%[0-9]+]]:ewl = VLD_idx_imm_3x32_pseudo [[COPY]], 64 :: (load (<16 x s16>), align 64) + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[COPY]], 96 :: (load (<16 x s16>) from unknown-address + 32) + ; CHECK-NEXT: [[VLD_idx_imm_3x32_pseudo1:%[0-9]+]]:vec256 = VLD_idx_imm_3x32_pseudo [[COPY]], 64 :: (load (<16 x s16>), align 64) ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec512 = REG_SEQUENCE [[VLD_idx_imm_3x32_pseudo1]], %subreg.sub_256_lo, [[VLD_idx_imm_3x32_pseudo]], %subreg.sub_256_hi ; CHECK-NEXT: [[VLDA_dmw_lda_w_ag_idx_imm:%[0-9]+]]:vec256 = VLDA_dmw_lda_w_ag_idx_imm [[COPY]], 128 :: (load (<16 x s16>)) ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[PADD_mod_pseudo]], implicit [[REG_SEQUENCE]], implicit [[VLDA_dmw_lda_w_ag_idx_imm]] @@ -90,16 +91,15 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vec512 = COPY $x0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vec256 = COPY $wl2 ; CHECK-NEXT: [[MOV_RLC_imm10_pseudo:%[0-9]+]]:er = MOV_RLC_imm10_pseudo 24 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:edj = COPY 
[[MOV_RLC_imm10_pseudo]] - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:em = COPY [[COPY3]] - ; CHECK-NEXT: [[PADD_mod_pseudo:%[0-9]+]]:ep = PADD_mod_pseudo [[COPY]], [[COPY4]] - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vec256 = COPY [[COPY1]].sub_256_lo - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vec256 = COPY [[COPY1]].sub_256_hi - ; CHECK-NEXT: [[MOV_PD_imm10_pseudo:%[0-9]+]]:edj = MOV_PD_imm10_pseudo 56 - ; CHECK-NEXT: VST_dmw_sts_w_ag_idx [[COPY6]], [[COPY]], [[MOV_PD_imm10_pseudo]] :: (store (<16 x s16>) into unknown-address + 32) - ; CHECK-NEXT: VST_dmw_sts_w_ag_idx [[COPY5]], [[COPY]], [[COPY3]] :: (store (<16 x s16>), align 64) - ; CHECK-NEXT: [[MOV_PD_imm10_pseudo1:%[0-9]+]]:edj = MOV_PD_imm10_pseudo 48 - ; CHECK-NEXT: VST_dmw_sts_w_ag_idx [[COPY2]], [[COPY]], [[MOV_PD_imm10_pseudo1]] :: (store (<16 x s16>)) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:em = COPY [[MOV_RLC_imm10_pseudo]] + ; CHECK-NEXT: [[PADD_mod_pseudo:%[0-9]+]]:ep = PADD_mod_pseudo [[COPY]], [[COPY3]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vec256 = COPY [[COPY1]].sub_256_lo + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vec256 = COPY [[COPY1]].sub_256_hi + ; CHECK-NEXT: [[PADD_mod_pseudo1:%[0-9]+]]:ep = PADD_mod_pseudo [[COPY]], [[COPY3]] + ; CHECK-NEXT: VST_dmw_sts_w_ag_idx_imm [[COPY5]], [[PADD_mod_pseudo1]], 32 :: (store (<16 x s16>) into unknown-address + 32) + ; CHECK-NEXT: VST_dmw_sts_w_ag_idx_imm [[COPY4]], [[PADD_mod_pseudo1]], 0 :: (store (<16 x s16>), align 64) + ; CHECK-NEXT: [[MOV_PD_imm10_pseudo:%[0-9]+]]:edj = MOV_PD_imm10_pseudo 48 + ; CHECK-NEXT: VST_dmw_sts_w_ag_idx [[COPY2]], [[COPY]], [[MOV_PD_imm10_pseudo]] :: (store (<16 x s16>)) ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[PADD_mod_pseudo]] %0:_(p0) = COPY $p0 %1:_(<32 x s16>) = COPY $x0 diff --git a/llvm/test/CodeGen/AIE/aie2/conv2d_offset_test.ll b/llvm/test/CodeGen/AIE/aie2/conv2d_offset_test.ll index 77f669ec8643..09e4fb215f2a 100644 --- a/llvm/test/CodeGen/AIE/aie2/conv2d_offset_test.ll +++ b/llvm/test/CodeGen/AIE/aie2/conv2d_offset_test.ll @@ -20,31 +20,29 @@ 
define dso_local noundef i32 @_Z3foov() #0 { ; CHECK-LABEL: _Z3foov: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nop ; movxm p1, #(X+92) -; CHECK-NEXT: mova m0, #-164 -; CHECK-NEXT: mov p0, p1 -; CHECK-NEXT: lda r0, [p1, #8] -; CHECK-NEXT: paddb [p0], #96 -; CHECK-NEXT: lda.u8 r1, [p0], m0 -; CHECK-NEXT: mova m0, #12 -; CHECK-NEXT: lda r1, [p1], #8 -; CHECK-NEXT: lda.u16 r1, [p0], m0 -; CHECK-NEXT: mova m0, #60 -; CHECK-NEXT: lda.u8 r1, [p0], m0 +; CHECK-NEXT: nopb ; nopa ; nops ; movxm p0, #(X+92); nopv +; CHECK-NEXT: mova dj0, #96 +; CHECK-NEXT: lda.u8 r1, [p0, dj0] +; CHECK-NEXT: mova dj0, #-68 +; CHECK-NEXT: lda r0, [p0, #8] +; CHECK-NEXT: lda r1, [p0, #0] +; CHECK-NEXT: lda.u16 r1, [p0, dj0] +; CHECK-NEXT: mova dj0, #-56 +; CHECK-NEXT: lda.u8 r1, [p0, dj0] ; CHECK-NEXT: nop -; CHECK-NEXT: lda r1, [p0], #76 +; CHECK-NEXT: lda r1, [p0, #4] ; CHECK-NEXT: add r0, r0, r1 ; CHECK-NEXT: add r0, r0, r1 ; CHECK-NEXT: nop ; CHECK-NEXT: add r0, r0, r1 ; CHECK-NEXT: add r0, r0, r1 -; CHECK-NEXT: lda r1, [p0], #12 +; CHECK-NEXT: lda r1, [p0, #80] ; CHECK-NEXT: add r0, r0, r1 -; CHECK-NEXT: lda r1, [p0], #-48 -; CHECK-NEXT: lda r1, [p0], #32 -; CHECK-NEXT: lda r1, [p1, #0] -; CHECK-NEXT: lda r1, [p0], #-136 -; CHECK-NEXT: lda r1, [p0, #0] +; CHECK-NEXT: lda r1, [p0, #92] +; CHECK-NEXT: lda r1, [p0, #44] +; CHECK-NEXT: lda r1, [p0, #8] +; CHECK-NEXT: lda r1, [p0, #76] +; CHECK-NEXT: lda r1, [p0, #-60] ; CHECK-NEXT: add r0, r0, r1 ; CHECK-NEXT: ret lr ; CHECK-NEXT: add r0, r0, r1 // Delay Slot 5 diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll index cab5733103b5..cd2c5cd026c1 100644 --- a/llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll +++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll @@ -35,50 +35,50 @@ define void @add2d(ptr noalias %params, ptr noalias %ifm1_data, ptr noalias %ifm ; ASM-LABEL: add2d: ; ASM: .p2align 4 ; ASM-NEXT: // %bb.0: // %newFuncRoot -; ASM-NEXT: 
paddb [p0], #40; nopx -; ASM-NEXT: lda m2, [p0], #-4 -; ASM-NEXT: lda m3, [p0], #8 -; ASM-NEXT: lda m5, [p0], #8 -; ASM-NEXT: lda m4, [p0], #-24 -; ASM-NEXT: lda r4, [p0], #36; paddb [sp], #32 -; ASM-NEXT: lda r2, [p0], #-32; st p7, [sp, #-32] // 4-byte Folded Spill -; ASM-NEXT: lda r0, [p0], #-12; st p6, [sp, #-28] // 4-byte Folded Spill -; ASM-NEXT: lda r1, [p0], #40; mov p6, sp -; ASM-NEXT: paddb [p6], #-36; mov p7, sp -; ASM-NEXT: lda r5, [p6, #0]; paddb [p7], #-40 -; ASM-NEXT: lda p7, [p7, #0]; mov p6, sp -; ASM-NEXT: lda m1, [p0], #36; paddb [p6], #-44 -; ASM-NEXT: lda p6, [p6, #0] -; ASM-NEXT: lda m0, [p0], #-8 -; ASM-NEXT: lda dn0, [p0], #-8; st r1, [p4, #0] -; ASM-NEXT: lda dj0, [p0], #12; nez r3, r0; mov p4, sp -; ASM-NEXT: lda dn4, [p0], #-8; st r3, [p5, #0] -; ASM-NEXT: lda dj4, [p0], #-36; paddb [p4], #-48; mov p5, r5 -; ASM-NEXT: lda p4, [p4, #0]; st m1, [p5, #0] -; ASM-NEXT: lda r0, [p0], #-36; mov p5, sp -; ASM-NEXT: lda r5, [p0, #0]; paddb [p5], #-52 -; ASM-NEXT: lda p5, [p5, #0]; mov p0, sp -; ASM-NEXT: st m0, [p7, #0] -; ASM-NEXT: mov p7, sp -; ASM-NEXT: paddb [p7], #-56; st dj0, [p6, #0] -; ASM-NEXT: lda r6, [p7, #0]; mov p6, sp -; ASM-NEXT: paddb [p0], #-72; mov p7, sp -; ASM-NEXT: lda p0, [p0, #0]; paddb [p6], #-60; st dj4, [p4, #0] -; ASM-NEXT: lda r7, [p6, #0]; mov p4, sp -; ASM-NEXT: paddb [p4], #-76; mov p6, sp -; ASM-NEXT: lda r11, [p4, #0]; paddb [p7], #-64; mov p4, sp -; ASM-NEXT: lda p7, [p7, #0]; paddb [p6], #-68; st dn0, [p5, #0] -; ASM-NEXT: lda r8, [p6, #0]; paddb [p4], #-80; nez r0, r0; mov p5, r6 -; ASM-NEXT: lda p6, [p4, #0]; st dn4, [p5, #0]; movx r6, #1 -; ASM-NEXT: ne r4, r4, r6; mov p4, sp -; ASM-NEXT: mova r6, #3; paddb [p4], #-84; add r7, r2, #-1; mov p5, r7 -; ASM-NEXT: lda r9, [p4, #0]; ltu r7, r7, r6; mov p4, sp -; ASM-NEXT: st r0, [p5, #0]; paddb [p4], #-88; jz r7, #.LBB0_2 -; ASM-NEXT: lda r10, [p4, #0]; mov p4, sp // Delay Slot 5 -; ASM-NEXT: paddb [p4], #-92; st r5, [p7, #0] // Delay Slot 4 -; ASM-NEXT: lda p4, [p4, 
#0]; paddb [p2], m3; mov p7, r8 // Delay Slot 3 -; ASM-NEXT: st r4, [p7, #0]; paddb [p2], m5; and r8, r2, r6 // Delay Slot 2 -; ASM-NEXT: padda [p1], m2; paddb [p2], m4; movx r6, #0; st r8, [p0, #0] // Delay Slot 1 +; ASM-NEXT: nopb ; lda dn0, [p0, #88]; nops ; nopxm ; nopv +; ASM-NEXT: lda dj0, [p0, #80] +; ASM-NEXT: lda dn4, [p0, #92]; paddb [sp], #32 +; ASM-NEXT: lda r0, [p0, #32]; st p6, [sp, #-28] // 4-byte Folded Spill +; ASM-NEXT: lda r1, [p0, #20]; mov p6, sp +; ASM-NEXT: lda dj4, [p0, #84]; paddb [p6], #-36 +; ASM-NEXT: lda r3, [p6, #0]; mov p6, sp +; ASM-NEXT: lda m2, [p0, #40]; paddb [p6], #-40 +; ASM-NEXT: lda r5, [p6, #0]; mov p6, sp +; ASM-NEXT: lda m1, [p0, #60]; paddb [p6], #-44 +; ASM-NEXT: lda r8, [p6, #0]; mov p6, sp +; ASM-NEXT: lda m4, [p0, #36]; st r1, [p4, #0]; nez r2, r0 +; ASM-NEXT: lda m0, [p0, #96]; paddb [p6], #-48; st r2, [p5, #0] +; ASM-NEXT: lda r9, [p6, #0]; mov p6, sp +; ASM-NEXT: lda m5, [p0, #44]; paddb [p6], #-52; mov p4, r3 +; ASM-NEXT: lda r4, [p6, #0]; mov p6, sp +; ASM-NEXT: lda m3, [p0, #52]; st m1, [p4, #0] +; ASM-NEXT: lda r0, [p0, #48]; paddb [p6], #-56; mov p4, r5 +; ASM-NEXT: lda r6, [p6, #0]; mov p6, sp +; ASM-NEXT: lda r3, [p0, #64]; st m0, [p4, #0] +; ASM-NEXT: lda r5, [p0, #28]; paddb [p6], #-60; mov p4, r8 +; ASM-NEXT: lda r7, [p6, #0]; mov p6, sp +; ASM-NEXT: lda r4, [p0, #12]; mov p0, sp +; ASM-NEXT: paddb [p0], #-72; st dj0, [p4, #0] +; ASM-NEXT: lda p5, [p0, #0]; mov p4, r9 +; ASM-NEXT: mov p0, sp +; ASM-NEXT: paddb [p0], #-76; st dj4, [p4, #0] +; ASM-NEXT: lda r9, [p0, #0]; mov p0, sp +; ASM-NEXT: paddb [p0], #-80; mov p4, r4 +; ASM-NEXT: lda r10, [p0, #0]; mov p0, sp +; ASM-NEXT: paddb [p0], #-84 +; ASM-NEXT: lda r11, [p0, #0]; paddb [p6], #-64; mov p0, sp +; ASM-NEXT: lda p7, [p6, #0]; paddb [p0], #-88; mov p6, sp +; ASM-NEXT: lda r12, [p0, #0]; st dn0, [p4, #0] +; ASM-NEXT: mova r6, #1; paddb [p6], #-68; mov p0, r6 +; ASM-NEXT: lda p6, [p6, #0]; st dn4, [p0, #0]; ne r5, r5, r6 +; ASM-NEXT: mova r6, #3; add 
r7, r3, #-1; mov p0, r7 +; ASM-NEXT: ltu r7, r7, r6 +; ASM-NEXT: st p7, [sp, #-32]; jz r7, #.LBB0_2 // 4-byte Folded Spill +; ASM-NEXT: nez r0, r0; mov p4, sp // Delay Slot 5 +; ASM-NEXT: paddb [p4], #-92; st r0, [p0, #0] // Delay Slot 4 +; ASM-NEXT: lda p4, [p4, #0]; paddb [p2], m4; st r4, [p7, #0] // Delay Slot 3 +; ASM-NEXT: padda [p1], m2; paddb [p2], m5; and r8, r3, r6; st r5, [p6, #0] // Delay Slot 2 +; ASM-NEXT: mova r6, #0; paddb [p2], m3; st r8, [p5, #0] // Delay Slot 1 ; ASM-NEXT: // %bb.1: ; ASM-NEXT: nopb ; nopa ; nops ; j #.LBB0_5; nopv ; ASM-NEXT: nopa ; nopx // Delay Slot 5 @@ -92,17 +92,17 @@ define void @add2d(ptr noalias %params, ptr noalias %ifm1_data, ptr noalias %ifm ; ASM-NEXT: vlda.ups.s32.d8 cm1, s1, [p1], m1; mov dc4, dc0 ; ASM-NEXT: vlda.3d.ups.s32.d8 cm2, s1, [p2], d0 ; ASM-NEXT: vlda.ups.s32.d8 cm3, s1, [p1], m1 -; ASM-NEXT: vlda.3d.ups.s32.d8 cm6, s1, [p2], d0; mov crUPSSign, r3 +; ASM-NEXT: vlda.3d.ups.s32.d8 cm6, s1, [p2], d0; mov crUPSSign, r2 ; ASM-NEXT: vlda.ups.s32.d8 cm4, s1, [p1], m1; mov s1, r1 ; ASM-NEXT: vlda.3d.ups.s32.d8 cm7, s1, [p2], d0 ; ASM-NEXT: vlda.3d.ups.s32.d8 cm5, s1, [p2], d0 ; ASM-NEXT: movxm ls, #.LBB0_3 ; ASM-NEXT: mova r6, #-4; movxm le, #.L_LEnd0 -; ASM-NEXT: and r2, r2, r6 -; ASM-NEXT: mova r6, #-2; add r2, r2, #-4 -; ASM-NEXT: lshl r2, r2, r6; mov crSRSSign, r4 -; ASM-NEXT: add r2, r2, #1; mov s0, r5 -; ASM-NEXT: add.nc lc, r2, #-1 +; ASM-NEXT: and r3, r3, r6 +; ASM-NEXT: mova r6, #-2; add r3, r3, #-4 +; ASM-NEXT: lshl r3, r3, r6; mov crSRSSign, r5 +; ASM-NEXT: add r3, r3, #1; mov s0, r4 +; ASM-NEXT: add.nc lc, r3, #-1 ; ASM-NEXT: .p2align 4 ; ASM-NEXT: .LBB0_3: // %for.body ; ASM-NEXT: // =>This Inner Loop Header: Depth=1 @@ -133,13 +133,14 @@ define void @add2d(ptr noalias %params, ptr noalias %ifm1_data, ptr noalias %ifm ; ASM-NEXT: mov crSRSSign, #0 ; ASM-NEXT: .p2align 4 ; ASM-NEXT: .LBB0_5: // %for.cond.cleanup.unr-lcssa.split -; ASM-NEXT: nopx ; mov p0, r10 -; ASM-NEXT: lda p7, [sp, #-32]; st r0, 
[p4, #0] // 4-byte Folded Reload -; ASM-NEXT: lda p6, [sp, #-28]; st r6, [p0, #0] // 4-byte Folded Reload -; ASM-NEXT: ret lr ; mov p0, r9 -; ASM-NEXT: st p3, [p0, #0] // Delay Slot 5 -; ASM-NEXT: mov p0, r11 // Delay Slot 4 -; ASM-NEXT: st p2, [p6, #0] // Delay Slot 3 +; ASM-NEXT: nopa ; mov p0, r12 +; ASM-NEXT: st r0, [p4, #0] +; ASM-NEXT: lda p7, [sp, #-32]; st r6, [p0, #0] // 4-byte Folded Reload +; ASM-NEXT: lda p6, [sp, #-28]; mov p0, r11 // 4-byte Folded Reload +; ASM-NEXT: st p3, [p0, #0]; ret lr +; ASM-NEXT: mov p0, r10 // Delay Slot 5 +; ASM-NEXT: st p2, [p0, #0] // Delay Slot 4 +; ASM-NEXT: mov p0, r9 // Delay Slot 3 ; ASM-NEXT: st p1, [p0, #0] // Delay Slot 2 ; ASM-NEXT: paddb [sp], #-32 // Delay Slot 1 newFuncRoot: diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/Memops.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/Memops.ll index dbfb46b618c2..998b261dff06 100644 --- a/llvm/test/CodeGen/AIE/aie2/end-to-end/Memops.ll +++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/Memops.ll @@ -43,24 +43,24 @@ define dso_local void @lowerMemcpyUsingWordByte() local_unnamed_addr #0 { ; CHECK-LABEL: lowerMemcpyUsingWordByte: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopb ; nopa ; nops ; movxm p1, #(buffer2+8); nopv -; CHECK-NEXT: lda.s8 r0, [p1, #0]; nopb ; movxm p0, #(buffer1+8); nops -; CHECK-NEXT: st.s8 r0, [p0, #0] +; CHECK-NEXT: nop ; movxm p0, #(buffer2+8) +; CHECK-NEXT: lda.s8 r0, [p0, #0] +; CHECK-NEXT: lda r1, [p0, #-4]; movxm p1, #(buffer1+8) +; CHECK-NEXT: st.s8 r0, [p1, #0] +; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: paddb [p1], #-8 -; CHECK-NEXT: lda r0, [p1], #4 -; CHECK-NEXT: lda r1, [p1, #0] +; CHECK-NEXT: lda r0, [p0, #-8] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: ret lr ; CHECK-NEXT: nop // Delay Slot 5 -; CHECK-NEXT: paddb [p0], #-8 // Delay Slot 4 -; CHECK-NEXT: st r0, [p0], #4 // Delay Slot 3 -; CHECK-NEXT: st r1, [p0, #0] // Delay Slot 2 +; 
CHECK-NEXT: nop // Delay Slot 4 +; CHECK-NEXT: st r1, [p1, #-4] // Delay Slot 3 +; CHECK-NEXT: st r0, [p1, #-8] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: tail call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 4 dereferenceable(9) @buffer1, ptr noundef nonnull align 4 dereferenceable(9) @buffer2, i32 9, i1 false) @@ -101,30 +101,31 @@ define dso_local void @lowerMemcpyUsingWordHalfByte() local_unnamed_addr #0 { ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: nopb ; nopa ; nops ; movxm p0, #(buffer2+8); nopv -; CHECK-NEXT: lda.s16 r0, [p0, #0]; movxm p1, #(buffer1+8) +; CHECK-NEXT: lda.s16 r0, [p0, #0]; nopb ; nopx +; CHECK-NEXT: lda r1, [p0, #-4]; movxm p1, #(buffer1+8) ; CHECK-NEXT: st.s16 r0, [p1, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: paddb [p0], #-8 -; CHECK-NEXT: lda r0, [p0], #4; mov m0, #6 -; CHECK-NEXT: lda r1, [p0], m0 ; CHECK-NEXT: nop +; CHECK-NEXT: lda.s8 r0, [p0, #2] +; CHECK-NEXT: st.s8 r0, [p1, #2] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: paddb [p1], #-8 -; CHECK-NEXT: st r0, [p1], #4 -; CHECK-NEXT: lda.s8 r0, [p0, #0]; st r1, [p1], m0 -; CHECK-NEXT: st.s8 r0, [p1, #0] +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: lda r0, [p0, #-8] +; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: ret lr ; CHECK-NEXT: nop // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 -; CHECK-NEXT: nop // Delay Slot 3 -; CHECK-NEXT: nop // Delay Slot 2 +; CHECK-NEXT: st r1, [p1, #-4] // Delay Slot 3 +; CHECK-NEXT: st r0, [p1, #-8] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: tail call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 4 dereferenceable(11) @buffer1, ptr noundef nonnull align 4 dereferenceable(11) @buffer2, i32 11, i1 false) diff --git a/llvm/test/CodeGen/AIE/aie2/llc-pipeline-aie2.ll b/llvm/test/CodeGen/AIE/aie2/llc-pipeline-aie2.ll index 8e83e5f1b89b..7a551835036f 100644 --- 
a/llvm/test/CodeGen/AIE/aie2/llc-pipeline-aie2.ll +++ b/llvm/test/CodeGen/AIE/aie2/llc-pipeline-aie2.ll @@ -123,6 +123,7 @@ ; AIE-O123-NEXT: MachineDominator Tree Construction ; AIE-O123-NEXT: AIE2 Post Legalizer Generic Combiner ; AIE-O123-NEXT: AIE Base Address Clustering Optimization +; AIE-O123-NEXT: AIE Pointer Modifier Optimization ; AIE-O123-NEXT: AIE2 Post Legalizer Custom Combiner ; AIE-O0123-NEXT: RegBankSelect diff --git a/llvm/test/CodeGen/AIE/aie2/mmo-load.ll b/llvm/test/CodeGen/AIE/aie2/mmo-load.ll index 731f9c113744..cc1eb5889d7f 100644 --- a/llvm/test/CodeGen/AIE/aie2/mmo-load.ll +++ b/llvm/test/CodeGen/AIE/aie2/mmo-load.ll @@ -80,9 +80,9 @@ define void @load_v16i32(i32 %idx, ptr %array) { ; CHECK-SAME: (load (<8 x s32>) from %ir.arrayidx.1 + 32) ; CHECK: VLDA_dmw_lda_w_ag_idx_imm ; CHECK-SAME: (load (<8 x s32>) from %ir.arrayidx.1) - ; CHECK: VLD_idx_imm_3x32_pseudo + ; CHECK: VLDA_dmw_lda_w_ag_idx_imm ; CHECK-SAME: (load (<8 x s32>) from %ir.arrayidx.0 + 32) - ; CHECK: VLD_idx_imm_3x32_pseudo + ; CHECK: VLDA_dmw_lda_w_ag_idx_imm ; CHECK-SAME: (load (<8 x s32>) from %ir.arrayidx.0) entry: %arrayidx.1 = getelementptr inbounds [16 x <16 x i32>], ptr %array, i32 0, i32 2 diff --git a/llvm/test/CodeGen/AIE/aie2/movxm_test.ll b/llvm/test/CodeGen/AIE/aie2/movxm_test.ll index 1691efb33b7d..0209d95d71e6 100644 --- a/llvm/test/CodeGen/AIE/aie2/movxm_test.ll +++ b/llvm/test/CodeGen/AIE/aie2/movxm_test.ll @@ -68,18 +68,17 @@ define dso_local noundef i32 @bar() { ; CHECK-LABEL: bar: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopb ; nopa ; nops ; movxm p0, #(Y+4); nopv -; CHECK-NEXT: lda.s16 r0, [p0, #-4]; nopx ; mov m0, #-2 -; CHECK-NEXT: lda.s8 r1, [p0, #0]; paddb [p0], m0 -; CHECK-NEXT: lda.s16 r2, [p0], #6 -; CHECK-NEXT: nop -; CHECK-NEXT: lda r3, [p0, #0] +; CHECK-NEXT: nopa ; movxm p0, #(Y+4) +; CHECK-NEXT: lda.s16 r0, [p0, #-2] +; CHECK-NEXT: lda.s16 r1, [p0, #-4] +; CHECK-NEXT: lda.s8 r2, [p0, #0] +; CHECK-NEXT: lda r3, [p0, 
#4] ; CHECK-NEXT: nop ; CHECK-NEXT: ret lr ; CHECK-NEXT: nop // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 -; CHECK-NEXT: add r0, r2, r0 // Delay Slot 3 -; CHECK-NEXT: add r0, r0, r1 // Delay Slot 2 +; CHECK-NEXT: add r0, r0, r1 // Delay Slot 3 +; CHECK-NEXT: add r0, r0, r2 // Delay Slot 2 ; CHECK-NEXT: add r0, r0, r3 // Delay Slot 1 entry: %0 = load i16, ptr getelementptr inbounds (%struct.test, ptr @Y, i32 0, i32 1), align 2 diff --git a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/MBB-crossing.mir b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/MBB-crossing.mir index 2b7726405ab9..a0b092679584 100644 --- a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/MBB-crossing.mir +++ b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/MBB-crossing.mir @@ -9,6 +9,7 @@ # RUN: llc --mtriple=aie2p -verify-machineinstrs \ # RUN: --start-before=aie2p-postlegalizer-generic-combiner \ # RUN: --stop-after=aie2p-postlegalizer-custom-combiner \ +# RUN: --aie-global-ptr-mod-opt=true \ # RUN: -o - %s | FileCheck %s # Consider MBB Crossing in Selection Decision @@ -28,8 +29,8 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 192 - ; CHECK-NEXT: G_STORE [[C1]](s32), [[COPY]](p0) :: (store (s32)) - ; CHECK-NEXT: G_AIE_OFFSET_STORE [[C1]](s32), [[COPY]](p0), [[C]](s20) :: (store (s32)) + ; CHECK-NEXT: [[AIE_POSTINC_STORE:%[0-9]+]]:_(p0) = G_AIE_POSTINC_STORE [[C1]](s32), [[COPY]], [[C]](s20) :: (store (s32)) + ; CHECK-NEXT: G_STORE [[C1]](s32), [[AIE_POSTINC_STORE]](p0) :: (store (s32)) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: G_STORE [[C1]](s32), [[COPY]](p0) :: (store (s32)) @@ -47,7 +48,7 @@ body: | PseudoRET implicit $lr ... 
-# %3(p0) is defined (bb.0) outside of the use in MBB bb.1 +# %3(p0) is defined (bb.0) outside of the use in MBB bb.1 --- name: def-outside-loop-dominance legalized: true @@ -96,7 +97,7 @@ body: | PseudoRET implicit $lr ... -# %0 is used to define a G_PTR_ADD (%10) in a different block (bb.2) than where +# %0 is used to define a G_PTR_ADD (%10) in a different block (bb.2) than where # %0 is otherwise used (bb.1) --- name: ptr-add-after-loop @@ -121,8 +122,8 @@ body: | ; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %6(s32), %bb.1, [[C3]](s32), %bb.0 ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = nsw G_ADD [[PHI]], [[C4]] - ; CHECK-NEXT: G_STORE [[ADD]](s32), [[COPY]](p0) :: (store (s32)) - ; CHECK-NEXT: G_AIE_OFFSET_STORE [[ADD]](s32), [[COPY]](p0), [[C2]](s20) :: (store (s32)) + ; CHECK-NEXT: [[AIE_POSTINC_STORE:%[0-9]+]]:_(p0) = G_AIE_POSTINC_STORE [[ADD]](s32), [[COPY]], [[C2]](s20) :: (store (s32)) + ; CHECK-NEXT: G_STORE [[ADD]](s32), [[AIE_POSTINC_STORE]](p0) :: (store (s32)) ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(sgt), [[ADD]](s32), [[C1]] ; CHECK-NEXT: G_BRCOND [[ICMP]](s32), %bb.1 ; CHECK-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/gemm.mir b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/gemm.mir index 0c7b51150ee5..cf143612076a 100644 --- a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/gemm.mir +++ b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/gemm.mir @@ -9,6 +9,7 @@ # RUN: llc --mtriple=aie2p -verify-machineinstrs \ # RUN: --start-before=aie2p-postlegalizer-generic-combiner \ # RUN: --stop-after=aie2p-postlegalizer-custom-combiner \ +# RUN: --aie-global-ptr-mod-opt=true \ # RUN: -o - %s | FileCheck %s # Verify that Gemm has the correct assignment @@ -39,16 +40,17 @@ body: | ; CHECK-NEXT: [[PHI1:%[0-9]+]]:_(p0) = G_PHI [[COPY]](p0), %bb.0, %9(p0), %bb.1 ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI]], [[C2]] ; CHECK-NEXT: 
[[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(sgt), [[ADD]](s32), [[C]] - ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p0), [[INT1:%[0-9]+]]:_(s20), [[INT2:%[0-9]+]]:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2p.add.3d), [[PHI1]](p0), [[C1]](s20), [[C1]](s20), [[C1]](s20), [[C1]](s20), [[C1]](s20), [[C1]](s20), [[C1]](s20) + ; CHECK-NEXT: [[AIE_POSTINC_3D_LOAD:%[0-9]+]]:_(<32 x s16>), [[AIE_POSTINC_3D_LOAD1:%[0-9]+]]:_(p0), [[AIE_POSTINC_3D_LOAD2:%[0-9]+]]:_(s20), [[AIE_POSTINC_3D_LOAD3:%[0-9]+]]:_ = G_AIE_POSTINC_3D_LOAD [[PHI1]], [[C1]], [[C1]], [[C1]], [[C1]], [[C1]], [[C1]], [[C1]] :: (load (<32 x s16>)) ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 - ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(<32 x s16>), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[PHI1]], %configZero(s20) :: (load (<32 x s16>)) ; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:_(<32 x s16>) = G_AIE_OFFSET_LOAD [[PHI1]](p0), [[C4]](s20) :: (load (<32 x s16>)) - ; CHECK-NEXT: [[AIE_POSTINC_LOAD2:%[0-9]+]]:_(<32 x s16>), [[AIE_POSTINC_LOAD3:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[AIE_POSTINC_LOAD1]], [[C4]](s20) :: (load (<32 x s16>)) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<32 x s16>) = G_LOAD [[AIE_POSTINC_LOAD3]](p0) :: (load (<32 x s16>)) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[PHI1]], %configZero(s20) + ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s20) = G_CONSTANT i20 0 + ; CHECK-NEXT: [[AIE_OFFSET_LOAD1:%[0-9]+]]:_(<32 x s16>) = G_AIE_OFFSET_LOAD [[PTR_ADD]](p0), [[C5]](s20) :: (load (<32 x s16>)) + ; CHECK-NEXT: [[AIE_OFFSET_LOAD2:%[0-9]+]]:_(<32 x s16>) = G_AIE_OFFSET_LOAD [[PTR_ADD]](p0), [[C4]](s20) :: (load (<32 x s16>)) ; CHECK-NEXT: G_BRCOND [[ICMP]](s32), %bb.1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[AIE_POSTINC_LOAD]](<32 x s16>), implicit [[AIE_OFFSET_LOAD]](<32 x s16>), implicit [[AIE_POSTINC_LOAD2]](<32 x s16>), implicit [[LOAD]](<32 x s16>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[AIE_POSTINC_3D_LOAD]](<32 x s16>), 
implicit [[AIE_OFFSET_LOAD]](<32 x s16>), implicit [[AIE_OFFSET_LOAD1]](<32 x s16>), implicit [[AIE_OFFSET_LOAD2]](<32 x s16>) bb.0: liveins: $p0, $p1 %0:_(p0) = COPY $p0 diff --git a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/greedy-log.mir b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/greedy-log.mir new file mode 100644 index 000000000000..c3b50122d44b --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/greedy-log.mir @@ -0,0 +1,58 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates + +# RUN: llc --mtriple=aie2p -verify-machineinstrs \ +# RUN: --start-before=aie2p-postlegalizer-generic-combiner \ +# RUN: --stop-after=aie2p-postlegalizer-custom-combiner \ +# RUN: --aie-global-ptr-mod-opt=true \ +# RUN: --aie-address-chaining=true \ +# RUN: -o /dev/null %s --debug-only=global-combiner 2>&1 | FileCheck %s + +# REQUIRES: asserts + +# Overlap Gain should only be counted for post-increments, since pre-increments +# don't support Immediate sharing yet (ptr_adds tend to be accumulated and thus +# the offset is currently not calculated properly). 
+# If overlap gain is applied indiscriminately, Offset[1] maximum gain is +# calculated incorrectly +--- +name: overlap-gain-estimation +legalized: true +alignment: 16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $p0 + ; CHECK: - Combiner Search Start + ; CHECK: [Greedy] Added PostInc[0] (%3:_,%2:_,) [1,0,0,] + ; CHECK: Immediate Register Reuse, using Ideal Gain + ; CHECK: [Greedy] Added PostInc[2] (%4:_,%2:_,) [1,0,0,] + ; CHECK: Immediate Register Reuse, using Ideal Gain + ; CHECK: Immediate Register Reuse, using Ideal Gain + ; CHECK: [Greedy] Added PostInc[4] (%5:_,%2:_,) [1,0,0,] + ; CHECK: Immediate Register Reuse, using Ideal Gain + ; CHECK: Immediate Register Reuse, using Ideal Gain + ; CHECK: [Greedy] Added PostInc[6] (%6:_,%2:_,) [1,0,0,] + ; CHECK: Immediate Register Reuse, using Ideal Gain + %0:_(p0) = COPY $p0 + %1:_(s20) = G_CONSTANT i20 785 + %2:_(s32) = G_CONSTANT i32 192 + G_STORE %2, %0(p0) :: (store (s32)) + %3:_(p0) = G_PTR_ADD %0, %1 + G_STORE %2(s32), %3(p0) :: (store (s32)) + %4:_(p0) = G_PTR_ADD %3, %1 + G_STORE %2(s32), %4(p0) :: (store (s32)) + %5:_(p0) = G_PTR_ADD %4, %1 + G_STORE %2(s32), %5(p0) :: (store (s32)) + %6:_(p0) = G_PTR_ADD %5, %1 + G_STORE %2(s32), %6(p0) :: (store (s32)) + + PseudoRET implicit $lr +... +## NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +# CHECK: {{.*}} diff --git a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/mixture-offset-postinc-selection.mir b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/mixture-offset-postinc-selection.mir index ab1b8d05bc0c..419847079826 100644 --- a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/mixture-offset-postinc-selection.mir +++ b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/mixture-offset-postinc-selection.mir @@ -10,6 +10,7 @@ # RUN: --start-before=aie2p-postlegalizer-generic-combiner \ # RUN: --stop-after=aie2p-postlegalizer-custom-combiner \ # RUN: --aie-address-chaining=true \ +# RUN: --aie-global-ptr-mod-opt=true \ # RUN: -o - %s | FileCheck %s # store hinders addr-chaining and thus leads to suboptimal assignments. diff --git a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/movability-check.mir b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/movability-check.mir index 18dbf15931bc..b54719cb4f80 100644 --- a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/movability-check.mir +++ b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/movability-check.mir @@ -13,15 +13,15 @@ # Check Movability -# Check that PTR_ADD which has a lower DAG depth than the CombineRoot (load), +# Check that PTR_ADD which has a lower DAG depth than the CombineRoot (load), # can be combined --- -name: gemm-kernel +name: move-up-check legalized: true alignment: 16 tracksRegLiveness: true body: | - ; CHECK-LABEL: name: gemm-kernel + ; CHECK-LABEL: name: move-up-check ; CHECK: bb.0: ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: liveins: $p0, $p1 @@ -44,6 +44,71 @@ body: | %7:_(p0) = G_PTR_ADD %0(p0), %6(s20) PseudoRET implicit $lr, implicit %5, implicit %7 ... 
+ +--- +name: move-multiple-preds +legalized: true +alignment: 16 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: move-multiple-preds + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $p0, $p1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s20) = G_TRUNC [[C]](s32) + ; CHECK-NEXT: %Res:_(s32), %7:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[TRUNC]](s20) :: (load (s32)) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit %Res(s32), implicit %7(p0) + bb.0: + liveins: $p0, $p1 + %0:_(p0) = COPY $p0 + %1:_(s32) = G_CONSTANT i32 0 + %2:_(s20) = G_CONSTANT i20 64 + %3:_(s32) = G_CONSTANT i32 -1 + bb.1: + %Res:_(s32) = G_LOAD %0(p0) :: (load (s32)) + %4:_(s32) = G_CONSTANT i32 8 + %5:_(s20) = G_TRUNC %4 + %7:_(p0) = G_PTR_ADD %0(p0), %5(s20) + PseudoRET implicit $lr, implicit %Res, implicit %7 +... 
+ + +--- +name: detect-no-movability +legalized: true +alignment: 16 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: detect-no-movability + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $p0, $p1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 128 + ; CHECK-NEXT: G_STORE [[C]](s20), [[COPY]](p0) :: (store (s20), align 4) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s20) = G_LOAD [[COPY]](p0) :: (load (s20), align 4) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[LOAD]](s20) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[PTR_ADD]](p0) + bb.0: + liveins: $p0, $p1 + %0:_(p0) = COPY $p0 + %1:_(s32) = G_CONSTANT i32 0 + %2:_(s20) = G_CONSTANT i20 64 + %3:_(s32) = G_CONSTANT i32 -1 + bb.1: + %4:_(s20) = G_ADD %2, %2 + G_STORE %4(s20), %0(p0) :: (store (s20)) + %5:_(s20) = G_LOAD %0(p0) :: (load (s20)) + %7:_(p0) = G_PTR_ADD %0(p0), %5(s20) + PseudoRET implicit $lr, implicit %7 ... 
diff --git a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/overlap-gain.mir b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/overlap-gain.mir index 1d3be84328d8..380bb0ebc0ea 100644 --- a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/overlap-gain.mir +++ b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/overlap-gain.mir @@ -10,6 +10,7 @@ # RUN: --start-before=aie2p-postlegalizer-generic-combiner \ # RUN: --stop-after=aie2p-postlegalizer-custom-combiner \ # RUN: --aie-address-chaining=false \ +# RUN: --aie-global-ptr-mod-opt=true \ # RUN: -o - %s | FileCheck %s # Check that there is no conflict between pointer independent post-increments @@ -61,8 +62,8 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 192 - ; CHECK-NEXT: G_STORE [[C1]](s32), [[COPY]](p0) :: (store (s32)) ; CHECK-NEXT: [[AIE_POSTINC_3D_STORE:%[0-9]+]]:_(p0), [[AIE_POSTINC_3D_STORE1:%[0-9]+]]:_(s20), [[AIE_POSTINC_3D_STORE2:%[0-9]+]]:_ = G_AIE_POSTINC_3D_STORE [[C1]](s32), [[COPY]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (store (s32)) + ; CHECK-NEXT: G_STORE [[C1]](s32), [[COPY]](p0) :: (store (s32)) ; CHECK-NEXT: G_STORE [[C1]](s32), [[AIE_POSTINC_3D_STORE]](p0) :: (store (s32)) ; CHECK-NEXT: PseudoRET implicit $lr %0:_(p0) = COPY $p0 diff --git a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/post-inc-eagerness.mir b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/post-inc-eagerness.mir index 803e19fa2e0a..9d3795839f82 100644 --- a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/post-inc-eagerness.mir +++ b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/post-inc-eagerness.mir @@ -10,6 +10,7 @@ # RUN: --start-before=aie2p-postlegalizer-generic-combiner \ # RUN: --stop-after=aie2p-postlegalizer-custom-combiner \ # RUN: --aie-address-chaining=false \ +# RUN: --aie-global-ptr-mod-opt=true \ # RUN: -o - %s | 
FileCheck %s # The post-increment with the first memory instruction causes a pointer copy, @@ -31,9 +32,10 @@ body: | ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 192 ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s20) = G_CONSTANT i20 8 - ; CHECK-NEXT: [[AIE_POSTINC_3D_STORE:%[0-9]+]]:_(p0), [[AIE_POSTINC_3D_STORE1:%[0-9]+]]:_(s20), [[AIE_POSTINC_3D_STORE2:%[0-9]+]]:_ = G_AIE_POSTINC_3D_STORE [[C1]](s32), [[COPY]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (store (s32)) - ; CHECK-NEXT: G_AIE_OFFSET_STORE [[C1]](s32), [[COPY]](p0), [[C2]](s20) :: (store (s32)) - ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[AIE_POSTINC_3D_STORE]](p0) + ; CHECK-NEXT: [[AIE_POSTINC_STORE:%[0-9]+]]:_(p0) = G_AIE_POSTINC_STORE [[C1]](s32), [[COPY]], [[C2]](s20) :: (store (s32)) + ; CHECK-NEXT: G_STORE [[C1]](s32), [[AIE_POSTINC_STORE]](p0) :: (store (s32)) + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p0), [[INT1:%[0-9]+]]:_(s20), [[INT2:%[0-9]+]]:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2p.add.3d), [[COPY]](p0), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[INT]](p0) %0:_(p0) = COPY $p0 %1:_(s20) = G_CONSTANT i20 64 %2:_(s32) = G_CONSTANT i32 192 @@ -60,9 +62,10 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 8 - ; CHECK-NEXT: %lZero:_(s32), %7:_(p0), %8:_(s20), %9:_ = G_AIE_POSTINC_3D_LOAD [[COPY]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (load (s32)) - ; CHECK-NEXT: %lOne:_(s32) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C1]](s20) :: (load (s32)) - ; CHECK-NEXT: PseudoRET implicit $lr, implicit %7(p0), implicit %lZero(s32), implicit %lOne(s32) + ; CHECK-NEXT: %lZero:_(s32), %5:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[C1]](s20) :: (load (s32)) + ; CHECK-NEXT: %lOne:_(s32) = G_LOAD %5(p0) :: (load (s32)) + ; CHECK-NEXT: 
[[INT:%[0-9]+]]:_(p0), [[INT1:%[0-9]+]]:_(s20), [[INT2:%[0-9]+]]:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2p.add.3d), [[COPY]](p0), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[INT]](p0), implicit %lZero(s32), implicit %lOne(s32) %0:_(p0) = COPY $p0 %1:_(s20) = G_CONSTANT i20 64 %2:_(s32) = G_CONSTANT i32 192 diff --git a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/reorder-Mem-Instrs.mir b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/reorder-Mem-Instrs.mir index e34bbaa37880..d12dca151822 100644 --- a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/reorder-Mem-Instrs.mir +++ b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/reorder-Mem-Instrs.mir @@ -10,9 +10,10 @@ # RUN: --start-before=aie2p-postlegalizer-generic-combiner \ # RUN: --stop-after=aie2p-postlegalizer-custom-combiner \ # RUN: --aie-address-chaining=false \ +# RUN: --aie-global-ptr-mod-opt=true \ # RUN: -o - %s | FileCheck %s -# Reorder Loads, so that the pointer does not have to be restored for the +# Reorder Loads, so that the pointer does not have to be restored for the # second Load Instr --- name: reorder-loads @@ -27,10 +28,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 - ; CHECK-NEXT: %sZero:_(s32), %6:_(p0), %7:_(s20), %8:_ = G_AIE_POSTINC_3D_LOAD [[COPY]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (load (s32)) - ; CHECK-NEXT: %sOne:_(s32) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C]](s20) :: (load (s32)) + ; CHECK-NEXT: %sZero:_(s32), %4:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[C]](s20) :: (load (s32)) + ; CHECK-NEXT: %sOne:_(s32) = G_LOAD %4(p0) :: (load (s32)) + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p0), [[INT1:%[0-9]+]]:_(s20), [[INT2:%[0-9]+]]:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2p.add.3d), [[COPY]](p0), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20), 
[[C]](s20) ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = nsw G_ADD %sZero, %sOne - ; CHECK-NEXT: G_STORE [[ADD]](s32), %6(p0) :: (store (s32)) + ; CHECK-NEXT: G_STORE [[ADD]](s32), [[INT]](p0) :: (store (s32)) ; CHECK-NEXT: PseudoRET implicit $lr %0:_(p0) = COPY $p0 %1:_(s20) = G_CONSTANT i20 64 diff --git a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/shared-postinc-constants.mir b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/shared-postinc-constants.mir index 49a25a9947cd..f44cf785a03a 100644 --- a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/shared-postinc-constants.mir +++ b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/shared-postinc-constants.mir @@ -9,6 +9,7 @@ # RUN: llc --mtriple=aie2p -verify-machineinstrs \ # RUN: --start-before=aie2p-postlegalizer-generic-combiner \ # RUN: --stop-after=aie2p-postlegalizer-custom-combiner \ +# RUN: --aie-global-ptr-mod-opt=true \ # RUN: --aie-address-chaining=true -o - %s | FileCheck %s # overlay gain: Do not penalize Immediates that can be reused, if the Immediates diff --git a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/user-intrinsics.mir b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/user-intrinsics.mir index 2455684ee8a6..dc23545aaa57 100644 --- a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/user-intrinsics.mir +++ b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/user-intrinsics.mir @@ -10,6 +10,7 @@ # RUN: --start-before=aie2p-postlegalizer-generic-combiner \ # RUN: --stop-after=aie2p-postlegalizer-custom-combiner \ # RUN: --aie-address-chaining=false \ +# RUN: --aie-global-ptr-mod-opt=true \ # RUN: -o - %s | FileCheck %s # properly assign user intrinsics @@ -28,11 +29,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p1 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 8 - ; CHECK-NEXT: [[AIE_POSTINC_3D_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_3D_LOAD1:%[0-9]+]]:_(p0), 
[[AIE_POSTINC_3D_LOAD2:%[0-9]+]]:_(s20), [[AIE_POSTINC_3D_LOAD3:%[0-9]+]]:_ = G_AIE_POSTINC_3D_LOAD [[COPY]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (load (s32)) - ; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:_(s32) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C]](s20) :: (load (s32)) - ; CHECK-NEXT: [[AIE_POSTINC_3D_LOAD4:%[0-9]+]]:_(s32), [[AIE_POSTINC_3D_LOAD5:%[0-9]+]]:_(p0), [[AIE_POSTINC_3D_LOAD6:%[0-9]+]]:_(s20), [[AIE_POSTINC_3D_LOAD7:%[0-9]+]]:_ = G_AIE_POSTINC_3D_LOAD [[COPY1]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (load (s32)) - ; CHECK-NEXT: [[AIE_OFFSET_LOAD1:%[0-9]+]]:_(s32) = G_AIE_OFFSET_LOAD [[COPY1]](p0), [[C]](s20) :: (load (s32)) - ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[AIE_POSTINC_3D_LOAD]](s32), implicit [[AIE_OFFSET_LOAD]](s32), implicit [[AIE_POSTINC_3D_LOAD4]](s32), implicit [[AIE_OFFSET_LOAD1]](s32), implicit [[AIE_POSTINC_3D_LOAD1]](p0), implicit [[AIE_POSTINC_3D_LOAD5]](p0) + ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[C]](s20) :: (load (s32)) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[AIE_POSTINC_LOAD1]](p0) :: (load (s32)) + ; CHECK-NEXT: [[AIE_POSTINC_LOAD2:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD3:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY1]], [[C]](s20) :: (load (s32)) + ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[AIE_POSTINC_LOAD3]](p0) :: (load (s32)) + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p0), [[INT1:%[0-9]+]]:_(s20), [[INT2:%[0-9]+]]:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2p.add.3d), [[COPY]](p0), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20) + ; CHECK-NEXT: [[INT3:%[0-9]+]]:_(p0), [[INT4:%[0-9]+]]:_(s20), [[INT5:%[0-9]+]]:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2p.add.3d), [[COPY1]](p0), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[AIE_POSTINC_LOAD]](s32), implicit [[LOAD]](s32), implicit 
[[AIE_POSTINC_LOAD2]](s32), implicit [[LOAD1]](s32), implicit [[INT]](p0), implicit [[INT3]](p0) %0:_(p0) = COPY $p0 %1:_(p0) = COPY $p1 %2:_(s20) = G_CONSTANT i20 8 diff --git a/llvm/test/CodeGen/AIE/aie2p/Memops.ll b/llvm/test/CodeGen/AIE/aie2p/Memops.ll index 8118c5fb4b4b..bf1305c7cc66 100644 --- a/llvm/test/CodeGen/AIE/aie2p/Memops.ll +++ b/llvm/test/CodeGen/AIE/aie2p/Memops.ll @@ -43,24 +43,24 @@ define dso_local void @lowerMemcpyUsingWordByte() local_unnamed_addr #0 { ; CHECK-LABEL: lowerMemcpyUsingWordByte: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopa ; nopb ; movxm p0, ##(buffer2+8); nops -; CHECK-NEXT: lda.s8 r0, [p0, #0]; movxm p1, ##(buffer1+8) +; CHECK-NEXT: movxm p0, ##(buffer2+8) +; CHECK-NEXT: lda.s8 r0, [p0, #0]; nopx +; CHECK-NEXT: lda r1, [p0, #-4]; movxm p1, ##(buffer1+8) ; CHECK-NEXT: st.s8 r0, [p1, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: mova m0, #-8 -; CHECK-NEXT: padda [p0], m0 -; CHECK-NEXT: lda r0, [p0], #4 -; CHECK-NEXT: lda r1, [p0, #0] +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: lda r0, [p0, #-8] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: ret lr -; CHECK-NEXT: movxm p0, ##(buffer1+8) // Delay Slot 5 -; CHECK-NEXT: padda [p0], m0 // Delay Slot 4 -; CHECK-NEXT: st r0, [p0], #4 // Delay Slot 3 -; CHECK-NEXT: st r1, [p0, #0] // Delay Slot 2 +; CHECK-NEXT: nop // Delay Slot 5 +; CHECK-NEXT: nop // Delay Slot 4 +; CHECK-NEXT: st r1, [p1, #-4] // Delay Slot 3 +; CHECK-NEXT: st r0, [p1, #-8] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: tail call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 4 dereferenceable(9) @buffer1, ptr noundef nonnull align 4 dereferenceable(9) @buffer2, i32 9, i1 false) @@ -100,31 +100,32 @@ define dso_local void @lowerMemcpyUsingWordHalfByte() local_unnamed_addr #0 { ; CHECK-LABEL: lowerMemcpyUsingWordHalfByte: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopa ; nopb ; movxm p0, 
##(buffer2+8); nops -; CHECK-NEXT: lda.s16 r0, [p0, #0]; movxm p1, ##(buffer1+8) +; CHECK-NEXT: nopa ; nopb ; nops ; movxm p0, ##(buffer2+8); nopv +; CHECK-NEXT: lda.s16 r0, [p0, #0]; nopb ; nopx +; CHECK-NEXT: lda r1, [p0, #-4]; movxm p1, ##(buffer1+8) ; CHECK-NEXT: st.s16 r0, [p1, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: mova m0, #-8 -; CHECK-NEXT: padda [p0], m0 -; CHECK-NEXT: lda r0, [p0], #4; mov m1, #6 -; CHECK-NEXT: lda r1, [p0], m1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: lda.s8 r0, [p0, #2] +; CHECK-NEXT: st.s8 r0, [p1, #2] ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: padda [p1], m0 -; CHECK-NEXT: st r0, [p1], #4 -; CHECK-NEXT: lda.s8 r0, [p0, #0]; st r1, [p1], m1 -; CHECK-NEXT: st.s8 r0, [p1, #0] +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: lda r0, [p0, #-8] +; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: ret lr ; CHECK-NEXT: nop // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 -; CHECK-NEXT: nop // Delay Slot 3 -; CHECK-NEXT: nop // Delay Slot 2 +; CHECK-NEXT: st r1, [p1, #-4] // Delay Slot 3 +; CHECK-NEXT: st r0, [p1, #-8] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: tail call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 4 dereferenceable(11) @buffer1, ptr noundef nonnull align 4 dereferenceable(11) @buffer2, i32 11, i1 false) diff --git a/llvm/test/CodeGen/AIE/aie2p/combine-loads-stores.mir b/llvm/test/CodeGen/AIE/aie2p/combine-loads-stores.mir index 514ba9c2bdec..f7eb71431f53 100644 --- a/llvm/test/CodeGen/AIE/aie2p/combine-loads-stores.mir +++ b/llvm/test/CodeGen/AIE/aie2p/combine-loads-stores.mir @@ -5,7 +5,7 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # # (c) Copyright 2025 Advanced Micro Devices, Inc. 
or its affiliates -# RUN: llc -mtriple aie2p -run-pass=aie2p-postlegalizer-custom-combiner %s -verify-machineinstrs -o - | FileCheck %s +# RUN: llc -mtriple aie2p --start-before=aie2p-postlegalizer-generic-combiner --stop-after=aie2p-postlegalizer-custom-combiner -verify-machineinstrs -o - %s | FileCheck %s --- name: load_to_preinc @@ -161,9 +161,7 @@ body: | ... -# Our current combine code is not able to move the memory operation up. In this -# case we cannot just move the pointer add to the load and we therefore don't -# combine. This could be improved. +# Move PTR_ADD Instruction down to the Memory Instruction. --- name: load_not_to_postinc_ptradd_before_load body: | @@ -175,10 +173,9 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s20) = G_TRUNC [[C]](s32) - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[TRUNC]](s20) - ; CHECK-NEXT: $p0 = COPY [[PTR_ADD]](p0) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32)) - ; CHECK-NEXT: $r0 = COPY [[LOAD]](s32) + ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[TRUNC]](s20) :: (load (s32)) + ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_LOAD1]](p0) + ; CHECK-NEXT: $r0 = COPY [[AIE_POSTINC_LOAD]](s32) %0:_(p0) = COPY $p0 %1:_(s32) = G_CONSTANT i32 24 %2:_(s20) = G_TRUNC %1 @@ -211,6 +208,28 @@ body: | $p0 = COPY %4 ... 
+# removed truncation from load_to_postinc_move_offset +--- +name: load_to_postinc_move_offset_no_trunc +body: | + bb.0: + liveins: $p0 + ; CHECK-LABEL: name: load_to_postinc_move_offset_no_trunc + ; CHECK: liveins: $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 12 + ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[C]](s20) :: (load (s32)) + ; CHECK-NEXT: $r0 = COPY [[AIE_POSTINC_LOAD]](s32) + ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_LOAD1]](p0) + %0:_(p0) = COPY $p0 + %3:_(s32) = G_LOAD %0 :: (load (s32)) + %1:_(s20) = G_CONSTANT i20 12 + %4:_(p0) = G_PTR_ADD %0, %1 + $r0 = COPY %3 + $p0 = COPY %4 +... + --- name: load_not_to_postinc_cannot_move_offset body: | @@ -220,12 +239,11 @@ body: | ; CHECK: liveins: $p0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32)) - ; CHECK-NEXT: $r0 = COPY [[LOAD]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s20) = G_TRUNC [[C]](s32) - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[TRUNC]](s20) - ; CHECK-NEXT: $p0 = COPY [[PTR_ADD]](p0) + ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[TRUNC]](s20) :: (load (s32)) + ; CHECK-NEXT: $r0 = COPY [[AIE_POSTINC_LOAD]](s32) + ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_LOAD1]](p0) %0:_(p0) = COPY $p0 %3:_(s32) = G_LOAD %0 :: (load (s32)) $r0 = COPY %3 @@ -235,6 +253,8 @@ body: | $p0 = COPY %4 ... 
+# Note: Global Combiners cannot determine movability with processed constants + --- name: load_to_postinc_arg_offset body: | @@ -366,9 +386,8 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $r0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 24 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s20) - ; CHECK-NEXT: $p0 = COPY [[PTR_ADD]](p0) - ; CHECK-NEXT: G_STORE [[COPY1]](s32), [[COPY]](p0) :: (store (s32)) + ; CHECK-NEXT: [[AIE_POSTINC_STORE:%[0-9]+]]:_(p0) = G_AIE_POSTINC_STORE [[COPY1]](s32), [[COPY]], [[C]](s20) :: (store (s32)) + ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_STORE]](p0) %0:_(p0) = COPY $p0 %1:_(s32) = COPY $r0 %2:_(s20) = G_CONSTANT i20 24 @@ -420,6 +439,28 @@ body: | $p0 = COPY %3 ... +# removed truncation from store_to_postinc_move_offset +--- +name: store_to_postinc_move_offset_no_truncation +body: | + bb.0: + liveins: $p0, $r0 + ; CHECK-LABEL: name: store_to_postinc_move_offset_no_truncation + ; CHECK: liveins: $p0, $r0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $r0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 24 + ; CHECK-NEXT: [[AIE_POSTINC_STORE:%[0-9]+]]:_(p0) = G_AIE_POSTINC_STORE [[COPY1]](s32), [[COPY]], [[C]](s20) :: (store (s32)) + ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_STORE]](p0) + %0:_(p0) = COPY $p0 + %1:_(s32) = COPY $r0 + G_STORE %1, %0 :: (store (s32)) + %2:_(s20) = G_CONSTANT i20 24 + %3:_(p0) = G_PTR_ADD %0, %2 + $p0 = COPY %3 +... 
+ --- name: store_not_to_postinc_cannot_move_offset body: | @@ -430,13 +471,12 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $r0 - ; CHECK-NEXT: G_STORE [[COPY1]](s32), [[COPY]](p0) :: (store (s32)) - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY $p1 - ; CHECK-NEXT: G_STORE [[COPY1]](s32), [[COPY2]](p0) :: (store (s32)) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s20) = G_TRUNC [[C]](s32) - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[TRUNC]](s20) - ; CHECK-NEXT: $p0 = COPY [[PTR_ADD]](p0) + ; CHECK-NEXT: [[AIE_POSTINC_STORE:%[0-9]+]]:_(p0) = G_AIE_POSTINC_STORE [[COPY1]](s32), [[COPY]], [[TRUNC]](s20) :: (store (s32)) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY $p1 + ; CHECK-NEXT: G_STORE [[COPY1]](s32), [[COPY2]](p0) :: (store (s32)) + ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_STORE]](p0) %0:_(p0) = COPY $p0 %1:_(s32) = COPY $r0 G_STORE %1, %0 :: (store (s32)) @@ -618,10 +658,9 @@ body: | ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s20) = G_TRUNC [[C]](s32) ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[TRUNC]](s20) - ; CHECK-NEXT: $p0 = COPY [[PTR_ADD]](p0) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32)) - ; CHECK-NEXT: $r0 = COPY [[LOAD]](s32) + ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[TRUNC]](s20) :: (load (s32)) + ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_LOAD1]](p0) + ; CHECK-NEXT: $r0 = COPY [[AIE_POSTINC_LOAD]](s32) %1:_(s32) = G_CONSTANT i32 24 %4:_(s20) = G_TRUNC %1 %0:_(p0) = COPY $p0 @@ -639,12 +678,11 @@ body: | ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s20) = G_TRUNC [[C]](s32) - ; CHECK-NEXT: 
[[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[TRUNC]](s20) - ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 - ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s20) = G_TRUNC [[C1]](s32) - ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[TRUNC1]](s20) :: (load (s32)) - ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_LOAD1]](p0) - ; CHECK-NEXT: $p1 = COPY [[PTR_ADD]](p0) + ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[TRUNC]](s20) :: (load (s32)) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 0 + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[AIE_POSTINC_LOAD1]], [[C1]](s20) + ; CHECK-NEXT: $p0 = COPY [[PTR_ADD]](p0) + ; CHECK-NEXT: $p1 = COPY [[AIE_POSTINC_LOAD1]](p0) ; CHECK-NEXT: $r0 = COPY [[AIE_POSTINC_LOAD]](s32) %2:_(p0) = COPY $p0 %7:_(s32) = G_LOAD %2 :: (load (s32)) @@ -981,7 +1019,6 @@ body: | # %0 in bb.1) then do not combine to a post increment because that would lead to # an additional COPY to preserve the original pointer. # Note: If the copy is inevitable (in this case it is) we might as well combine. 
-# Our current implementation does not consider this case --- name: not_combine_postinc_later_use body: | @@ -992,13 +1029,12 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32)) - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s20) - ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s20) - ; CHECK-NEXT: $r0 = COPY [[LOAD]](s32) + ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[C]](s20) :: (load (s32)) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s20) + ; CHECK-NEXT: $r0 = COPY [[AIE_POSTINC_LOAD]](s32) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[PTR_ADD]](p0), implicit [[PTR_ADD1]](p0), implicit [[COPY]](p0) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[AIE_POSTINC_LOAD1]](p0), implicit [[PTR_ADD]](p0), implicit [[COPY]](p0) bb.0: %0:_(p0) = COPY $p0 %1:_(s20) = G_CONSTANT i20 64 @@ -1023,9 +1059,8 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32)) - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s20) - ; CHECK-NEXT: $r0 = COPY [[LOAD]](s32) + ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[C]](s20) :: (load (s32)) + ; CHECK-NEXT: $r0 = COPY [[AIE_POSTINC_LOAD]](s32) ; CHECK-NEXT: PseudoJNZ $r1, %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: @@ -1040,7 +1075,7 @@ body: | ; CHECK-NEXT: PseudoJ_jump_imm %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[PTR_ADD]](p0) + ; CHECK-NEXT: 
PseudoRET implicit $lr, implicit [[AIE_POSTINC_LOAD1]](p0) bb.0: %0:_(p0) = COPY $p0 %1:_(s20) = G_CONSTANT i20 64 @@ -1069,9 +1104,8 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32)) - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s20) - ; CHECK-NEXT: $r0 = COPY [[LOAD]](s32) + ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[C]](s20) :: (load (s32)) + ; CHECK-NEXT: $r0 = COPY [[AIE_POSTINC_LOAD]](s32) ; CHECK-NEXT: PseudoJNZ $r1, %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: @@ -1086,7 +1120,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: $p0 = COPY [[COPY]](p0) - ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[PTR_ADD]](p0) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[AIE_POSTINC_LOAD1]](p0) bb.0: %0:_(p0) = COPY $p0 %1:_(s20) = G_CONSTANT i20 64 @@ -1152,8 +1186,6 @@ body: | ... # In this case we would want to combine the postincrement. -# But the current heuristic is too conservative and since the use in bb.2 does -# not dominate the combined instruction in bb.1 it aborts the combining. 
--- name: postinc_bb_1_use_bb_2 body: | @@ -1168,9 +1200,8 @@ body: | ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32)) - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s20) - ; CHECK-NEXT: $r0 = COPY [[LOAD]](s32) + ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[C]](s20) :: (load (s32)) + ; CHECK-NEXT: $r0 = COPY [[AIE_POSTINC_LOAD]](s32) ; CHECK-NEXT: PseudoJ_jump_imm %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: @@ -1180,7 +1211,7 @@ body: | ; CHECK-NEXT: PseudoJ_jump_imm %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[PTR_ADD]](p0) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[AIE_POSTINC_LOAD1]](p0) bb.0: %0:_(p0) = COPY $p0 PseudoJNZ $r1, %bb.2 @@ -1214,9 +1245,8 @@ body: | ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32)) - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s20) - ; CHECK-NEXT: $r0 = COPY [[LOAD]](s32) + ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[C]](s20) :: (load (s32)) + ; CHECK-NEXT: $r0 = COPY [[AIE_POSTINC_LOAD]](s32) ; CHECK-NEXT: PseudoJ_jump_imm %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: @@ -1226,7 +1256,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: $p0 = COPY [[COPY]](p0) - ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[PTR_ADD]](p0) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[AIE_POSTINC_LOAD1]](p0) bb.0: %0:_(p0) = COPY $p0 PseudoJNZ $r1, %bb.2 @@ -1292,8 +1322,6 @@ body: | ... # In this case we would want to combine the postincrement. 
-# But the current heuristic is too conservative and since the use in bb.1 does -# not dominate the combined instruction in bb.3 it aborts the combining. --- name: postinc_bb_3_use_bb_1 body: | @@ -1317,10 +1345,9 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32)) - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s20) - ; CHECK-NEXT: $r0 = COPY [[LOAD]](s32) - ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[PTR_ADD]](p0) + ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[C]](s20) :: (load (s32)) + ; CHECK-NEXT: $r0 = COPY [[AIE_POSTINC_LOAD]](s32) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[AIE_POSTINC_LOAD1]](p0) bb.0: %0:_(p0) = COPY $p0 PseudoJNZ $r1, %bb.2 @@ -1348,13 +1375,13 @@ body: | ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p1 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 - ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(<32 x s8>), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[C]](s20) :: (load (<32 x s8>)) + ; CHECK-NEXT: [[AIE_POSTINC_STORE:%[0-9]+]]:_(p0) = G_AIE_POSTINC_STORE [[AIE_POSTINC_LOAD]](<32 x s8>), [[COPY1]], [[C]](s20) :: (store (<32 x s8>)) ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_LOAD1]](p0) - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY1]], [[C]](s20) - ; CHECK-NEXT: [[AIE_POSTINC_STORE:%[0-9]+]]:_(p0) = G_AIE_POSTINC_STORE [[AIE_POSTINC_LOAD]](<32 x s8>), [[COPY1]], [[C1]](s20) :: (store (<32 x s8>)) - ; CHECK-NEXT: $p1 = COPY [[PTR_ADD]](p0) - ; CHECK-NEXT: $p2 = COPY [[AIE_POSTINC_STORE]](p0) + ; CHECK-NEXT: $p1 = COPY [[AIE_POSTINC_STORE]](p0) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 0 + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD 
[[AIE_POSTINC_STORE]], [[C1]](s20) + ; CHECK-NEXT: $p2 = COPY [[PTR_ADD]](p0) ; CHECK-NEXT: $wl0 = COPY [[AIE_POSTINC_LOAD]](<32 x s8>) %0:_(p0) = COPY $p0 %6:_(p0) = COPY $p1 @@ -1380,11 +1407,11 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p1 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 ; CHECK-NEXT: [[AIE_POSTINC_2D_LOAD:%[0-9]+]]:_(<32 x s8>), [[AIE_POSTINC_2D_LOAD1:%[0-9]+]]:_(p0), [[AIE_POSTINC_2D_LOAD2:%[0-9]+]]:_(s20) = G_AIE_POSTINC_2D_LOAD [[COPY]], [[C]], [[C]], [[C]], [[C]] :: (load (<32 x s8>)) + ; CHECK-NEXT: [[AIE_POSTINC_2D_STORE:%[0-9]+]]:_(p0), [[AIE_POSTINC_2D_STORE1:%[0-9]+]]:_(s20) = G_AIE_POSTINC_2D_STORE [[AIE_POSTINC_2D_LOAD]](<32 x s8>), [[COPY1]], [[C]], [[C]], [[C]], [[C]] :: (store (<32 x s8>)) ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_2D_LOAD1]](p0) + ; CHECK-NEXT: $p1 = COPY [[AIE_POSTINC_2D_STORE]](p0) ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p0), [[INT1:%[0-9]+]]:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2p.add.2d), [[COPY1]](p0), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20) - ; CHECK-NEXT: [[AIE_POSTINC_2D_STORE:%[0-9]+]]:_(p0), [[AIE_POSTINC_2D_STORE1:%[0-9]+]]:_(s20) = G_AIE_POSTINC_2D_STORE [[AIE_POSTINC_2D_LOAD]](<32 x s8>), [[COPY1]], [[C]], [[C]], [[C]], [[C]] :: (store (<32 x s8>)) - ; CHECK-NEXT: $p1 = COPY [[INT]](p0) - ; CHECK-NEXT: $p2 = COPY [[AIE_POSTINC_2D_STORE]](p0) + ; CHECK-NEXT: $p2 = COPY [[INT]](p0) ; CHECK-NEXT: $wl0 = COPY [[AIE_POSTINC_2D_LOAD]](<32 x s8>) %0:_(p0) = COPY $p0 %6:_(p0) = COPY $p1 @@ -1409,11 +1436,11 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p1 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 ; CHECK-NEXT: [[AIE_POSTINC_3D_LOAD:%[0-9]+]]:_(<32 x s8>), [[AIE_POSTINC_3D_LOAD1:%[0-9]+]]:_(p0), [[AIE_POSTINC_3D_LOAD2:%[0-9]+]]:_(s20), [[AIE_POSTINC_3D_LOAD3:%[0-9]+]]:_ = G_AIE_POSTINC_3D_LOAD [[COPY]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (load (<32 x s8>)) + ; CHECK-NEXT: [[AIE_POSTINC_3D_STORE:%[0-9]+]]:_(p0), [[AIE_POSTINC_3D_STORE1:%[0-9]+]]:_(s20), 
[[AIE_POSTINC_3D_STORE2:%[0-9]+]]:_ = G_AIE_POSTINC_3D_STORE [[AIE_POSTINC_3D_LOAD]](<32 x s8>), [[COPY1]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (store (<32 x s8>)) ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_3D_LOAD1]](p0) + ; CHECK-NEXT: $p1 = COPY [[AIE_POSTINC_3D_STORE]](p0) ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p0), [[INT1:%[0-9]+]]:_(s20), [[INT2:%[0-9]+]]:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2p.add.3d), [[COPY1]](p0), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20) - ; CHECK-NEXT: [[AIE_POSTINC_3D_STORE:%[0-9]+]]:_(p0), [[AIE_POSTINC_3D_STORE1:%[0-9]+]]:_(s20), [[AIE_POSTINC_3D_STORE2:%[0-9]+]]:_ = G_AIE_POSTINC_3D_STORE [[AIE_POSTINC_3D_LOAD]](<32 x s8>), [[COPY1]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (store (<32 x s8>)) - ; CHECK-NEXT: $p1 = COPY [[INT]](p0) - ; CHECK-NEXT: $p2 = COPY [[AIE_POSTINC_3D_STORE]](p0) + ; CHECK-NEXT: $p2 = COPY [[INT]](p0) ; CHECK-NEXT: $wl0 = COPY [[AIE_POSTINC_3D_LOAD]](<32 x s8>) %0:_(p0) = COPY $p0 %6:_(p0) = COPY $p1 @@ -1464,13 +1491,13 @@ body: | ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p1 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 - ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(<32 x s16>), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[C]](s20) :: (load (<32 x s16>)) + ; CHECK-NEXT: [[AIE_POSTINC_STORE:%[0-9]+]]:_(p0) = G_AIE_POSTINC_STORE [[AIE_POSTINC_LOAD]](<32 x s16>), [[COPY1]], [[C]](s20) :: (store (<32 x s16>)) ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_LOAD1]](p0) - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY1]], [[C]](s20) - ; CHECK-NEXT: [[AIE_POSTINC_STORE:%[0-9]+]]:_(p0) = G_AIE_POSTINC_STORE [[AIE_POSTINC_LOAD]](<32 x s16>), [[COPY1]], [[C1]](s20) :: (store (<32 x s16>)) - ; CHECK-NEXT: $p1 = COPY [[PTR_ADD]](p0) - ; CHECK-NEXT: $p2 = COPY [[AIE_POSTINC_STORE]](p0) + ; CHECK-NEXT: $p1 = COPY 
[[AIE_POSTINC_STORE]](p0) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 0 + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[AIE_POSTINC_STORE]], [[C1]](s20) + ; CHECK-NEXT: $p2 = COPY [[PTR_ADD]](p0) %0:_(p0) = COPY $p0 %6:_(p0) = COPY $p1 %1:_(s20) = G_CONSTANT i20 64 @@ -1539,13 +1566,13 @@ body: | ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p1 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 - ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(<32 x s32>), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[C]](s20) :: (load (<32 x s32>)) + ; CHECK-NEXT: [[AIE_POSTINC_STORE:%[0-9]+]]:_(p0) = G_AIE_POSTINC_STORE [[AIE_POSTINC_LOAD]](<32 x s32>), [[COPY1]], [[C]](s20) :: (store (<32 x s32>)) ; CHECK-NEXT: $p0 = COPY [[AIE_POSTINC_LOAD1]](p0) - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY1]], [[C]](s20) - ; CHECK-NEXT: [[AIE_POSTINC_STORE:%[0-9]+]]:_(p0) = G_AIE_POSTINC_STORE [[AIE_POSTINC_LOAD]](<32 x s32>), [[COPY1]], [[C1]](s20) :: (store (<32 x s32>)) - ; CHECK-NEXT: $p1 = COPY [[PTR_ADD]](p0) - ; CHECK-NEXT: $p2 = COPY [[AIE_POSTINC_STORE]](p0) + ; CHECK-NEXT: $p1 = COPY [[AIE_POSTINC_STORE]](p0) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 0 + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[AIE_POSTINC_STORE]], [[C1]](s20) + ; CHECK-NEXT: $p2 = COPY [[PTR_ADD]](p0) %0:_(p0) = COPY $p0 %6:_(p0) = COPY $p1 %1:_(s20) = G_CONSTANT i20 64 @@ -1568,13 +1595,13 @@ body: | ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p1 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 - ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<64 x s32>) = G_LOAD [[COPY]](p0) :: (load (<64 x s32>)) ; CHECK-NEXT: G_STORE [[LOAD]](<64 x s32>), [[COPY1]](p0) :: (store (<64 x s32>)) ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], 
[[C]](s20) ; CHECK-NEXT: $p0 = COPY [[PTR_ADD]](p0) ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY1]], [[C]](s20) ; CHECK-NEXT: $p1 = COPY [[PTR_ADD1]](p0) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY1]], [[C1]](s20) ; CHECK-NEXT: $p2 = COPY [[PTR_ADD2]](p0) %0:_(p0) = COPY $p0 diff --git a/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_convert.ll b/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_convert.ll index c749e9ab9ceb..88242ad52468 100644 --- a/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_convert.ll +++ b/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_convert.ll @@ -16,26 +16,24 @@ define weak_odr dso_local void @convert_bf16_to_bfp16(ptr noalias %in, ptr noali ; CHECK-LABEL: convert_bf16_to_bfp16: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda r0, [p2, #0]; nopb ; nops ; nopx ; mov m0, #4; nopv -; CHECK-NEXT: padda [p2], m0; nopx -; CHECK-NEXT: lda dn0, [p2], #4 -; CHECK-NEXT: lda m1, [p2], #4 +; CHECK-NEXT: lda dn0, [p2, #4]; nopx +; CHECK-NEXT: lda m1, [p2, #8] ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: movx r24, #0 +; CHECK-NEXT: mova r24, #0 ; CHECK-NEXT: mova dj0, #0; mov r26, r24 -; CHECK-NEXT: vldb.fill.512 [p0, lf0, r24]; mov dj1, dj0 +; CHECK-NEXT: lda r0, [p2, #0]; vldb.fill.512 [p0, lf0, r24]; mov dj1, dj0 ; CHECK-NEXT: movs dc1, dj0; vldb.pop.512 x0, [p0, lf0, r24]; mov dn1, dn0 ; CHECK-NEXT: vldb.pop.512.2d x2, [p0, lf0, r24, d1] ; CHECK-NEXT: nop ; CHECK-NEXT: vldb.fill.512 [p0, lf0, r24] -; CHECK-NEXT: lda m0, [p2, #0]; vldb.pop.512 x0, [p0, lf0, r24]; movxm ls, #.LBB0_1 +; CHECK-NEXT: lda m0, [p2, #12]; vldb.pop.512 x0, [p0, lf0, r24]; movxm ls, #.LBB0_1 ; CHECK-NEXT: vldb.pop.512.2d x2, [p0, lf0, r24, d1]; movxm le, #.L_LEnd0 ; CHECK-NEXT: add.nc lc, r0, #-3 ; CHECK-NEXT: nopa ; vldb.fill.512 [p0, lf0, r24]; nops ; nopxm ; nopv ; CHECK-NEXT: nopa ; vldb.pop.512 x0, [p0, lf0, r24]; nops ; nopx ; 
vconv.fp32.bf16 cml0, x0; nopv ; CHECK-NEXT: nopa ; vldb.pop.512.2d x2, [p0, lf0, r24, d1]; nops ; nopx ; vconv.fp32.bf16 cmh0, x2; nopv -; CHECK-NEXT: nopa ; nopb ; movs dc0, dj0; nopx ; mov p2, p1; nopv +; CHECK-NEXT: nopa ; nopb ; movs p2, p1; nopx ; mov dc0, dj0; nopv ; CHECK-NEXT: // implicit-def: $sf ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_1: // %for.body diff --git a/llvm/test/CodeGen/AIE/aie2p/fifo-loads.ll b/llvm/test/CodeGen/AIE/aie2p/fifo-loads.ll index bface0a7a6ed..24a3c7b91cf1 100644 --- a/llvm/test/CodeGen/AIE/aie2p/fifo-loads.ll +++ b/llvm/test/CodeGen/AIE/aie2p/fifo-loads.ll @@ -17,24 +17,24 @@ define dso_local void @_Z17test_fifo_ld_fillRPDv64_DB8_R12fifo_state_t(ptr nocap ; CHECK-LABEL: _Z17test_fifo_ld_fillRPDv64_DB8_R12fifo_state_t: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p0, [p0, #0]; mov dj0, #128 -; CHECK-NEXT: lda r24, [p1, dj0] +; CHECK-NEXT: lda p0, [p0, #0]; mov p3, p1 +; CHECK-NEXT: vlda lfl0, [p3], #128 +; CHECK-NEXT: lda r24, [p3, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: movs p2, p0 ; CHECK-NEXT: nop -; CHECK-NEXT: vlda lfl0, [p1, #0]; movs p2, p0 ; CHECK-NEXT: vlda lfh0, [p1, #64] ; CHECK-NEXT: vldb.fill.512 [p0, lf0, r24] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: nop ; CHECK-NEXT: ret lr -; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 5 -; CHECK-NEXT: vst lfl0, [p1], #128 // Delay Slot 4 -; CHECK-NEXT: st r24, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: st r24, [p3, #0] // Delay Slot 5 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p0, [p2, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -56,24 +56,24 @@ define dso_local noundef <64 x i8> @_Z16test_fifo_ld_popRPDv64_DB8_R12fifo_state ; CHECK-LABEL: _Z16test_fifo_ld_popRPDv64_DB8_R12fifo_state_t: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p0, [p0, #0]; mov 
dj0, #128 -; CHECK-NEXT: lda r24, [p1, dj0] +; CHECK-NEXT: lda p0, [p0, #0]; mov p3, p1 +; CHECK-NEXT: vlda lfl0, [p3], #128 +; CHECK-NEXT: lda r24, [p3, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: movs p2, p0 ; CHECK-NEXT: nop -; CHECK-NEXT: vlda lfl0, [p1, #0]; movs p2, p0 ; CHECK-NEXT: vlda lfh0, [p1, #64] ; CHECK-NEXT: vldb.pop.512 x0, [p0, lf0, r24] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: nop ; CHECK-NEXT: ret lr -; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 5 -; CHECK-NEXT: vst lfl0, [p1], #128 // Delay Slot 4 -; CHECK-NEXT: st r24, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: st r24, [p3, #0] // Delay Slot 5 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p0, [p2, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -96,24 +96,24 @@ define dso_local noundef <64 x i8> @_Z24test_fifo_ld_pop_1d_byteRPDv64_DB8_R12fi ; CHECK-LABEL: _Z24test_fifo_ld_pop_1d_byteRPDv64_DB8_R12fifo_state_ti: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p0, [p0, #0]; nopb ; nops ; nopx ; mov dj0, #128; nopv -; CHECK-NEXT: lda r24, [p1, dj0]; nopb ; nopx +; CHECK-NEXT: lda p0, [p0, #0]; nopb ; nops ; nopx ; mov p3, p1; nopv +; CHECK-NEXT: vlda lfl0, [p3], #128; nopb ; nopx +; CHECK-NEXT: lda r24, [p3, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: movs p2, p0 ; CHECK-NEXT: nop -; CHECK-NEXT: vlda lfl0, [p1, #0]; movs p2, p0 ; CHECK-NEXT: vlda lfh0, [p1, #64]; mov m0, r0 ; CHECK-NEXT: vldb.pop.512 x0, [p0, lf0, r24, m0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: nop ; CHECK-NEXT: ret lr -; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 5 -; CHECK-NEXT: vst lfl0, [p1], #128 // Delay Slot 4 -; CHECK-NEXT: st r24, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: st r24, [p3, #0] // Delay Slot 5 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 +; 
CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p0, [p2, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -138,24 +138,24 @@ define dso_local noundef <64 x i8> @_Z24test_fifo_ld_pop_2d_byteRPDv64_DB8_R12fi ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: lda p0, [p0, #0]; nopx -; CHECK-NEXT: lda dc0, [p2, #0]; mov dj0, #128 -; CHECK-NEXT: lda r24, [p1, dj0] -; CHECK-NEXT: nop +; CHECK-NEXT: lda dc0, [p2, #0]; mov p4, p1 +; CHECK-NEXT: vlda lfl0, [p4], #128 +; CHECK-NEXT: lda r24, [p4, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: movs p3, p0 -; CHECK-NEXT: vlda lfl0, [p1, #0]; mov m0, r0 +; CHECK-NEXT: nop +; CHECK-NEXT: mov m0, r0 ; CHECK-NEXT: vlda lfh0, [p1, #64]; movs dn0, r1; mov dj0, r2 ; CHECK-NEXT: vldb.pop.512.2d x0, [p0, lf0, r24, d0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: nop ; CHECK-NEXT: st dc0, [p2, #0]; ret lr -; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 5 -; CHECK-NEXT: vst lfl0, [p1], #128 // Delay Slot 4 -; CHECK-NEXT: st r24, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: st r24, [p4, #0] // Delay Slot 5 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p0, [p3, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -188,24 +188,24 @@ define dso_local noundef <64 x i8> @_Z24test_fifo_ld_pop_3d_byteRPDv64_DB8_R12fi ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: lda p0, [p0, #0]; nopxm ; CHECK-NEXT: lda dc0, [p2, #0] -; CHECK-NEXT: lda dc4, [p3, #0]; mov dj0, #128 -; CHECK-NEXT: lda r24, [p1, dj0] -; CHECK-NEXT: nop +; CHECK-NEXT: lda dc4, [p3, #0]; mov p5, p1 +; CHECK-NEXT: vlda lfl0, [p5], #128 +; CHECK-NEXT: lda r24, [p5, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: movs p4, p0 +; CHECK-NEXT: nop ; CHECK-NEXT: mov m0, r0 -; CHECK-NEXT: vlda lfl0, [p1, #0]; movs dn0, r1; mov dj4, r4 -; CHECK-NEXT: vlda lfh0, [p1, #64]; movs dn4, r3; mov dj0, r2 +; CHECK-NEXT: movs dn0, r1; mov 
dj0, r2 +; CHECK-NEXT: vlda lfh0, [p1, #64]; movs dn4, r3; mov dj4, r4 ; CHECK-NEXT: vldb.pop.512.3d x0, [p0, lf0, r24, d0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: nop ; CHECK-NEXT: st dc0, [p2, #0] ; CHECK-NEXT: st dc4, [p3, #0]; ret lr -; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 5 -; CHECK-NEXT: vst lfl0, [p1], #128 // Delay Slot 4 -; CHECK-NEXT: st r24, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: st r24, [p5, #0] // Delay Slot 5 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p0, [p4, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -244,24 +244,24 @@ define dso_local %struct.v64bfp16ebs8 @_Z16test_fifo_ld_popRP22v64bfp16ebs8_unal ; CHECK-LABEL: _Z16test_fifo_ld_popRP22v64bfp16ebs8_unalignedR12fifo_state_t: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p0, [p0, #0]; mov dj0, #128 -; CHECK-NEXT: lda r24, [p1, dj0] +; CHECK-NEXT: lda p0, [p0, #0]; mov p3, p1 +; CHECK-NEXT: vlda lfl0, [p3], #128 +; CHECK-NEXT: lda r24, [p3, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: movs p2, p0 ; CHECK-NEXT: nop -; CHECK-NEXT: vlda lfl0, [p1, #0]; movs p2, p0 ; CHECK-NEXT: vlda lfh0, [p1, #64] ; CHECK-NEXT: vldb.pop.576 ex0, [p0, lf0, r24] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: nop ; CHECK-NEXT: ret lr -; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 5 -; CHECK-NEXT: vst lfl0, [p1], #128 // Delay Slot 4 -; CHECK-NEXT: st r24, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: st r24, [p3, #0] // Delay Slot 5 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p0, [p2, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -287,24 +287,24 @@ define dso_local %struct.v64bfp16ebs8 @_Z24test_fifo_ld_pop_1d_byteRP22v64bfp16e ; CHECK-LABEL: 
_Z24test_fifo_ld_pop_1d_byteRP22v64bfp16ebs8_unalignedR12fifo_state_ti: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p0, [p0, #0]; nopb ; nops ; nopx ; mov dj0, #128; nopv -; CHECK-NEXT: lda r24, [p1, dj0]; nopb ; nopx +; CHECK-NEXT: lda p0, [p0, #0]; nopb ; nops ; nopx ; mov p3, p1; nopv +; CHECK-NEXT: vlda lfl0, [p3], #128; nopb ; nopx +; CHECK-NEXT: lda r24, [p3, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: movs p2, p0 ; CHECK-NEXT: nop -; CHECK-NEXT: vlda lfl0, [p1, #0]; movs p2, p0 ; CHECK-NEXT: vlda lfh0, [p1, #64]; mov m0, r0 ; CHECK-NEXT: vldb.pop.576 ex0, [p0, lf0, r24, m0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: nop ; CHECK-NEXT: ret lr -; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 5 -; CHECK-NEXT: vst lfl0, [p1], #128 // Delay Slot 4 -; CHECK-NEXT: st r24, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: st r24, [p3, #0] // Delay Slot 5 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p0, [p2, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -332,24 +332,24 @@ define dso_local %struct.v64bfp16ebs8 @_Z24test_fifo_ld_pop_2d_byteRP22v64bfp16e ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: lda p0, [p0, #0]; nopx -; CHECK-NEXT: lda dc0, [p2, #0]; mov dj0, #128 -; CHECK-NEXT: lda r24, [p1, dj0] -; CHECK-NEXT: nop +; CHECK-NEXT: lda dc0, [p2, #0]; mov p4, p1 +; CHECK-NEXT: vlda lfl0, [p4], #128 +; CHECK-NEXT: lda r24, [p4, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: movs p3, p0 -; CHECK-NEXT: vlda lfl0, [p1, #0]; mov m0, r0 +; CHECK-NEXT: nop +; CHECK-NEXT: mov m0, r0 ; CHECK-NEXT: vlda lfh0, [p1, #64]; movs dn0, r1; mov dj0, r2 ; CHECK-NEXT: vldb.pop.576.2d ex0, [p0, lf0, r24, d0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: nop ; CHECK-NEXT: st dc0, [p2, #0]; ret lr -; CHECK-NEXT: vst lfh0, [p1, #64] // Delay 
Slot 5 -; CHECK-NEXT: vst lfl0, [p1], #128 // Delay Slot 4 -; CHECK-NEXT: st r24, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: st r24, [p4, #0] // Delay Slot 5 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p0, [p3, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -385,24 +385,24 @@ define dso_local %struct.v64bfp16ebs8 @_Z24test_fifo_ld_pop_3d_byteRP22v64bfp16e ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: lda p0, [p0, #0]; nopxm ; CHECK-NEXT: lda dc0, [p2, #0] -; CHECK-NEXT: lda dc4, [p3, #0]; mov dj0, #128 -; CHECK-NEXT: lda r24, [p1, dj0] -; CHECK-NEXT: nop +; CHECK-NEXT: lda dc4, [p3, #0]; mov p5, p1 +; CHECK-NEXT: vlda lfl0, [p5], #128 +; CHECK-NEXT: lda r24, [p5, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: movs p4, p0 +; CHECK-NEXT: nop ; CHECK-NEXT: mov m0, r0 -; CHECK-NEXT: vlda lfl0, [p1, #0]; movs dn0, r1; mov dj4, r4 -; CHECK-NEXT: vlda lfh0, [p1, #64]; movs dn4, r3; mov dj0, r2 +; CHECK-NEXT: movs dn0, r1; mov dj0, r2 +; CHECK-NEXT: vlda lfh0, [p1, #64]; movs dn4, r3; mov dj4, r4 ; CHECK-NEXT: vldb.pop.576.3d ex0, [p0, lf0, r24, d0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: nop ; CHECK-NEXT: st dc0, [p2, #0] ; CHECK-NEXT: st dc4, [p3, #0]; ret lr -; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 5 -; CHECK-NEXT: vst lfl0, [p1], #128 // Delay Slot 4 -; CHECK-NEXT: st r24, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: st r24, [p5, #0] // Delay Slot 5 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p0, [p4, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -444,24 +444,24 @@ define dso_local %struct.v64bfp16ebs16 @_Z16test_fifo_ld_popRP23v64bfp16ebs16_un ; CHECK-LABEL: _Z16test_fifo_ld_popRP23v64bfp16ebs16_unalignedR12fifo_state_t: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p0, [p0, #0]; mov dj0, #128 -; CHECK-NEXT: lda r24, [p1, dj0] +; CHECK-NEXT: 
lda p0, [p0, #0]; mov p3, p1 +; CHECK-NEXT: vlda lfl0, [p3], #128 +; CHECK-NEXT: lda r24, [p3, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: movs p2, p0 ; CHECK-NEXT: nop -; CHECK-NEXT: vlda lfl0, [p1, #0]; movs p2, p0 ; CHECK-NEXT: vlda lfh0, [p1, #64] ; CHECK-NEXT: vldb.pop.544 ex0, [p0, lf0, r24] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: nop ; CHECK-NEXT: ret lr -; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 5 -; CHECK-NEXT: vst lfl0, [p1], #128 // Delay Slot 4 -; CHECK-NEXT: st r24, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: st r24, [p3, #0] // Delay Slot 5 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p0, [p2, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -487,24 +487,24 @@ define dso_local %struct.v64bfp16ebs16 @_Z24test_fifo_ld_pop_1d_byteRP23v64bfp16 ; CHECK-LABEL: _Z24test_fifo_ld_pop_1d_byteRP23v64bfp16ebs16_unalignedR12fifo_state_ti: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p0, [p0, #0]; nopb ; nops ; nopx ; mov dj0, #128; nopv -; CHECK-NEXT: lda r24, [p1, dj0]; nopb ; nopx +; CHECK-NEXT: lda p0, [p0, #0]; nopb ; nops ; nopx ; mov p3, p1; nopv +; CHECK-NEXT: vlda lfl0, [p3], #128; nopb ; nopx +; CHECK-NEXT: lda r24, [p3, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: movs p2, p0 ; CHECK-NEXT: nop -; CHECK-NEXT: vlda lfl0, [p1, #0]; movs p2, p0 ; CHECK-NEXT: vlda lfh0, [p1, #64]; mov m0, r0 ; CHECK-NEXT: vldb.pop.544 ex0, [p0, lf0, r24, m0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: nop ; CHECK-NEXT: ret lr -; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 5 -; CHECK-NEXT: vst lfl0, [p1], #128 // Delay Slot 4 -; CHECK-NEXT: st r24, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: st r24, [p3, #0] // Delay Slot 5 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay 
Slot 3 ; CHECK-NEXT: st p0, [p2, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -532,24 +532,24 @@ define dso_local %struct.v64bfp16ebs16 @_Z24test_fifo_ld_pop_2d_byteRP23v64bfp16 ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: lda p0, [p0, #0]; nopx -; CHECK-NEXT: lda dc0, [p2, #0]; mov dj0, #128 -; CHECK-NEXT: lda r24, [p1, dj0] -; CHECK-NEXT: nop +; CHECK-NEXT: lda dc0, [p2, #0]; mov p4, p1 +; CHECK-NEXT: vlda lfl0, [p4], #128 +; CHECK-NEXT: lda r24, [p4, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: movs p3, p0 -; CHECK-NEXT: vlda lfl0, [p1, #0]; mov m0, r0 +; CHECK-NEXT: nop +; CHECK-NEXT: mov m0, r0 ; CHECK-NEXT: vlda lfh0, [p1, #64]; movs dn0, r1; mov dj0, r2 ; CHECK-NEXT: vldb.pop.544.2d ex0, [p0, lf0, r24, d0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: nop ; CHECK-NEXT: st dc0, [p2, #0]; ret lr -; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 5 -; CHECK-NEXT: vst lfl0, [p1], #128 // Delay Slot 4 -; CHECK-NEXT: st r24, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: st r24, [p4, #0] // Delay Slot 5 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p0, [p3, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -585,24 +585,24 @@ define dso_local %struct.v64bfp16ebs16 @_Z24test_fifo_ld_pop_3d_byteRP23v64bfp16 ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: lda p0, [p0, #0]; nopxm ; CHECK-NEXT: lda dc0, [p2, #0] -; CHECK-NEXT: lda dc4, [p3, #0]; mov dj0, #128 -; CHECK-NEXT: lda r24, [p1, dj0] -; CHECK-NEXT: nop +; CHECK-NEXT: lda dc4, [p3, #0]; mov p5, p1 +; CHECK-NEXT: vlda lfl0, [p5], #128 +; CHECK-NEXT: lda r24, [p5, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: movs p4, p0 +; CHECK-NEXT: nop ; CHECK-NEXT: mov m0, r0 -; CHECK-NEXT: vlda lfl0, [p1, #0]; movs dn0, r1; mov dj4, r4 -; CHECK-NEXT: vlda lfh0, [p1, #64]; movs dn4, r3; mov dj0, r2 +; CHECK-NEXT: movs dn0, r1; mov dj0, r2 +; CHECK-NEXT: vlda lfh0, [p1, 
#64]; movs dn4, r3; mov dj4, r4 ; CHECK-NEXT: vldb.pop.544.3d ex0, [p0, lf0, r24, d0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: nop ; CHECK-NEXT: st dc0, [p2, #0] ; CHECK-NEXT: st dc4, [p3, #0]; ret lr -; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 5 -; CHECK-NEXT: vst lfl0, [p1], #128 // Delay Slot 4 -; CHECK-NEXT: st r24, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: st r24, [p5, #0] // Delay Slot 5 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p0, [p4, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -643,13 +643,14 @@ define dso_local noundef <64 x i8> @_Z17test_fifo_ld_popxRPDv64_hR12fifo_state_t ; CHECK-LABEL: _Z17test_fifo_ld_popxRPDv64_hR12fifo_state_t: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p0, [p0, #0]; nopx ; mov dj0, #128 -; CHECK-NEXT: lda r24, [p1, dj0] +; CHECK-NEXT: lda p0, [p0, #0]; nopx ; mov p3, p1 +; CHECK-NEXT: vlda lfl0, [p3], #128 +; CHECK-NEXT: lda r24, [p3, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: vlda lfl0, [p1, #0] -; CHECK-NEXT: vlda lfh0, [p1, #64]; movs p2, p0 +; CHECK-NEXT: movs p2, p0 +; CHECK-NEXT: vlda lfh0, [p1, #64] ; CHECK-NEXT: vlda lfe, [p1, #192]; movxm r30, #2015 ; CHECK-NEXT: vldb.popx.512 x0, [p0, lf0, r24] ; CHECK-NEXT: nop @@ -657,11 +658,10 @@ define dso_local noundef <64 x i8> @_Z17test_fifo_ld_popxRPDv64_hR12fifo_state_t ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: nop -; CHECK-NEXT: vst lfe, [p1, #192]; ret lr -; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 5 -; CHECK-NEXT: vst lfl0, [p1], #128 // Delay Slot 4 -; CHECK-NEXT: st r24, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: st r24, [p3, #0]; ret lr +; CHECK-NEXT: vst lfe, [p1, #192] // Delay Slot 5 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p0, [p2, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay 
Slot 1 entry: @@ -688,13 +688,14 @@ define dso_local void @_Z18test_fifo_ld_fillxRP22v64bfp16ebs8_unalignedR12fifo_s ; CHECK-LABEL: _Z18test_fifo_ld_fillxRP22v64bfp16ebs8_unalignedR12fifo_state_tii: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p0, [p0, #0]; mov dj0, #128 -; CHECK-NEXT: lda r24, [p1, dj0]; nopx +; CHECK-NEXT: lda p0, [p0, #0]; nopx ; mov p3, p1 +; CHECK-NEXT: vlda lfl0, [p3], #128 +; CHECK-NEXT: lda r24, [p3, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: vlda lfl0, [p1, #0] -; CHECK-NEXT: mova r2, #6 -; CHECK-NEXT: vlda lfh0, [p1, #64]; movs p2, p0; lshl r0, r0, r2 +; CHECK-NEXT: nop +; CHECK-NEXT: mova r2, #6; movs p2, p0 +; CHECK-NEXT: vlda lfh0, [p1, #64]; lshl r0, r0, r2 ; CHECK-NEXT: vlda lfe, [p1, #192]; or r30, r0, r1 ; CHECK-NEXT: vldb.fillx.512 [p0, lf0, r24] ; CHECK-NEXT: nop @@ -702,11 +703,10 @@ define dso_local void @_Z18test_fifo_ld_fillxRP22v64bfp16ebs8_unalignedR12fifo_s ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: nop -; CHECK-NEXT: vst lfe, [p1, #192]; ret lr -; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 5 -; CHECK-NEXT: vst lfl0, [p1], #128 // Delay Slot 4 -; CHECK-NEXT: st r24, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: st r24, [p3, #0]; ret lr +; CHECK-NEXT: vst lfe, [p1, #192] // Delay Slot 5 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p0, [p2, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -735,8 +735,8 @@ define dso_local void @_Z18test_fifo_ld_resetRrP23v128bfp16ebs8_unalignedR12fifo ; CHECK-LABEL: _Z18test_fifo_ld_resetRrP23v128bfp16ebs8_unalignedR12fifo_state_t: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p0, [p0, #0]; nopb ; nops ; nopxm ; nopv -; CHECK-NEXT: nopx +; CHECK-NEXT: lda p0, [p0, #0] +; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: vlda lfl0, [p1, #0] @@ -747,11 +747,10 @@ define dso_local void 
@_Z18test_fifo_ld_resetRrP23v128bfp16ebs8_unalignedR12fifo ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: nop ; CHECK-NEXT: ret lr -; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 5 -; CHECK-NEXT: vst lfl0, [p1], #128 // Delay Slot 4 -; CHECK-NEXT: st r24, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: st r24, [p1, dj0] // Delay Slot 5 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p0, [p2, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -773,24 +772,24 @@ define dso_local void @_Z17test_fifo_ld_fillRrP23v128bfp16ebs8_unalignedR12fifo_ ; CHECK-LABEL: _Z17test_fifo_ld_fillRrP23v128bfp16ebs8_unalignedR12fifo_state_t: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p0, [p0, #0]; mov dj0, #128 -; CHECK-NEXT: lda r24, [p1, dj0] +; CHECK-NEXT: lda p0, [p0, #0]; mov p3, p1 +; CHECK-NEXT: vlda lfl0, [p3], #128 +; CHECK-NEXT: lda r24, [p3, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: movs p2, p0 ; CHECK-NEXT: nop -; CHECK-NEXT: vlda lfl0, [p1, #0]; movs p2, p0 ; CHECK-NEXT: vlda lfh0, [p1, #64] ; CHECK-NEXT: vldb.fill.512 [p0, lf0, r24] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: nop ; CHECK-NEXT: ret lr -; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 5 -; CHECK-NEXT: vst lfl0, [p1], #128 // Delay Slot 4 -; CHECK-NEXT: st r24, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: st r24, [p3, #0] // Delay Slot 5 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p0, [p2, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -812,13 +811,14 @@ define dso_local %struct.v128bfp16ebs8 @_Z16test_fifo_ld_popRrP23v128bfp16ebs8_u ; CHECK-LABEL: _Z16test_fifo_ld_popRrP23v128bfp16ebs8_unalignedR12fifo_state_t: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p0, [p0, #0]; nopb ; nops ; nopx ; mov dj0, 
#128; nopv -; CHECK-NEXT: lda r24, [p1, dj0]; nopb ; nopx +; CHECK-NEXT: lda p0, [p0, #0]; nopb ; nops ; nopx ; mov p3, p1; nopv +; CHECK-NEXT: vlda lfl0, [p3], #128; nopb ; nopx +; CHECK-NEXT: lda r24, [p3, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: movs p2, p0 ; CHECK-NEXT: nop -; CHECK-NEXT: vlda lfl0, [p1, #0]; movs p2, p0 ; CHECK-NEXT: vlda lfh0, [p1, #64] ; CHECK-NEXT: vldb.pop.576 ex0, [p0, lf0, r24] ; CHECK-NEXT: nop @@ -826,11 +826,11 @@ define dso_local %struct.v128bfp16ebs8 @_Z16test_fifo_ld_popRrP23v128bfp16ebs8_u ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st r24, [p1, dj0] +; CHECK-NEXT: st r24, [p3, #0] ; CHECK-NEXT: vst lfl0, [p1, #0] ; CHECK-NEXT: vst lfh0, [p1, #64] ; CHECK-NEXT: st p0, [p2, #0] -; CHECK-NEXT: lda r24, [p1, dj0] +; CHECK-NEXT: lda r24, [p3, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -842,11 +842,10 @@ define dso_local %struct.v128bfp16ebs8 @_Z16test_fifo_ld_popRrP23v128bfp16ebs8_u ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: nop ; CHECK-NEXT: ret lr -; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 5 -; CHECK-NEXT: vst lfl0, [p1], #128 // Delay Slot 4 -; CHECK-NEXT: st r24, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: st r24, [p3, #0] // Delay Slot 5 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p0, [p2, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -885,13 +884,14 @@ define dso_local %struct.v128bfp16ebs8 @_Z24test_fifo_ld_pop_1d_byteRrP23v128bfp ; CHECK-LABEL: _Z24test_fifo_ld_pop_1d_byteRrP23v128bfp16ebs8_unalignedR12fifo_state_ti: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p0, [p0, #0]; nopb ; nops ; nopx ; mov dj0, #128; nopv -; CHECK-NEXT: lda r24, [p1, dj0]; nopx +; CHECK-NEXT: lda p0, [p0, #0]; nopb ; nops ; nopx ; mov p3, p1; nopv +; CHECK-NEXT: vlda lfl0, [p3], #128; nopx +; CHECK-NEXT: lda r24, [p3, #0] ; 
CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: movs p2, p0 ; CHECK-NEXT: nop -; CHECK-NEXT: vlda lfl0, [p1, #0]; movs p2, p0 ; CHECK-NEXT: vlda lfh0, [p1, #64] ; CHECK-NEXT: vldb.pop.576 ex0, [p0, lf0, r24] ; CHECK-NEXT: nop @@ -899,11 +899,11 @@ define dso_local %struct.v128bfp16ebs8 @_Z24test_fifo_ld_pop_1d_byteRrP23v128bfp ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st r24, [p1, dj0] +; CHECK-NEXT: st r24, [p3, #0] ; CHECK-NEXT: vst lfl0, [p1, #0] ; CHECK-NEXT: vst lfh0, [p1, #64] ; CHECK-NEXT: st p0, [p2, #0] -; CHECK-NEXT: lda r24, [p1, dj0] +; CHECK-NEXT: lda r24, [p3, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -915,11 +915,10 @@ define dso_local %struct.v128bfp16ebs8 @_Z24test_fifo_ld_pop_1d_byteRrP23v128bfp ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: nop ; CHECK-NEXT: ret lr -; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 5 -; CHECK-NEXT: vst lfl0, [p1], #128 // Delay Slot 4 -; CHECK-NEXT: st r24, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: st r24, [p3, #0] // Delay Slot 5 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p0, [p2, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -959,13 +958,14 @@ define dso_local %struct.v128bfp16ebs8 @_Z24test_fifo_ld_pop_2d_byteRrP23v128bfp ; CHECK-LABEL: _Z24test_fifo_ld_pop_2d_byteRrP23v128bfp16ebs8_unalignedR12fifo_state_tiiRii: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p0, [p0, #0]; mov dj0, #128 -; CHECK-NEXT: lda r24, [p1, dj0] +; CHECK-NEXT: lda p0, [p0, #0]; mov p4, p1 +; CHECK-NEXT: vlda lfl0, [p4], #128 +; CHECK-NEXT: lda r24, [p4, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: movs p3, p0 ; CHECK-NEXT: nop -; CHECK-NEXT: vlda lfl0, [p1, #0]; movs p3, p0 ; CHECK-NEXT: vlda lfh0, [p1, #64] ; CHECK-NEXT: vldb.pop.576 ex0, [p0, lf0, r24] ; CHECK-NEXT: nop @@ -973,11 +973,11 @@ define dso_local 
%struct.v128bfp16ebs8 @_Z24test_fifo_ld_pop_2d_byteRrP23v128bfp ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st r24, [p1, dj0] +; CHECK-NEXT: st r24, [p4, #0] ; CHECK-NEXT: vst lfl0, [p1, #0] ; CHECK-NEXT: vst lfh0, [p1, #64] ; CHECK-NEXT: st p0, [p3, #0] -; CHECK-NEXT: lda r24, [p1, dj0] +; CHECK-NEXT: lda r24, [p4, #0] ; CHECK-NEXT: lda dc0, [p2, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -990,11 +990,10 @@ define dso_local %struct.v128bfp16ebs8 @_Z24test_fifo_ld_pop_2d_byteRrP23v128bfp ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: nop ; CHECK-NEXT: st dc0, [p2, #0]; ret lr -; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 5 -; CHECK-NEXT: vst lfl0, [p1], #128 // Delay Slot 4 -; CHECK-NEXT: st r24, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: st r24, [p4, #0] // Delay Slot 5 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p0, [p3, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -1041,13 +1040,14 @@ define dso_local %struct.v128bfp16ebs8 @_Z24test_fifo_ld_pop_3d_byteRrP23v128bfp ; CHECK-LABEL: _Z24test_fifo_ld_pop_3d_byteRrP23v128bfp16ebs8_unalignedR12fifo_state_tiiRiiiS5_i: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p0, [p0, #0]; nopx ; mov dj0, #128 -; CHECK-NEXT: lda r24, [p1, dj0] +; CHECK-NEXT: lda p0, [p0, #0]; nopx ; mov p5, p1 +; CHECK-NEXT: vlda lfl0, [p5], #128 +; CHECK-NEXT: lda r24, [p5, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: movs p4, p0 ; CHECK-NEXT: nop -; CHECK-NEXT: vlda lfl0, [p1, #0]; movs p4, p0 ; CHECK-NEXT: vlda lfh0, [p1, #64] ; CHECK-NEXT: vldb.pop.576 ex0, [p0, lf0, r24] ; CHECK-NEXT: nop @@ -1055,29 +1055,28 @@ define dso_local %struct.v128bfp16ebs8 @_Z24test_fifo_ld_pop_3d_byteRrP23v128bfp ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st r24, [p1, dj0] +; CHECK-NEXT: st r24, [p5, #0] ; CHECK-NEXT: vst lfl0, [p1, #0] ; 
CHECK-NEXT: vst lfh0, [p1, #64] ; CHECK-NEXT: st p0, [p4, #0] -; CHECK-NEXT: lda r24, [p1, dj0] +; CHECK-NEXT: lda r24, [p5, #0] ; CHECK-NEXT: lda dc0, [p2, #0] ; CHECK-NEXT: lda dc4, [p3, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: mov m0, r0 -; CHECK-NEXT: vlda lfl0, [p1, #0]; movs dn0, r1; mov dj4, r4 -; CHECK-NEXT: vlda lfh0, [p1, #64]; movs dn4, r3; mov dj0, r2 +; CHECK-NEXT: vlda lfl0, [p1, #0]; movs dn0, r1; mov dj0, r2 +; CHECK-NEXT: vlda lfh0, [p1, #64]; movs dn4, r3; mov dj4, r4 ; CHECK-NEXT: vldb.pop.576.3d ex1, [p0, lf0, r24, d0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: nop ; CHECK-NEXT: st dc0, [p2, #0] ; CHECK-NEXT: st dc4, [p3, #0]; ret lr -; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 5 -; CHECK-NEXT: vst lfl0, [p1], #128 // Delay Slot 4 -; CHECK-NEXT: st r24, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: st r24, [p5, #0] // Delay Slot 5 +; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 +; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p0, [p4, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: diff --git a/llvm/test/CodeGen/AIE/aie2p/ldst-fifo-stores.ll b/llvm/test/CodeGen/AIE/aie2p/ldst-fifo-stores.ll index aa34a5a08b11..0dad1049c609 100644 --- a/llvm/test/CodeGen/AIE/aie2p/ldst-fifo-stores.ll +++ b/llvm/test/CodeGen/AIE/aie2p/ldst-fifo-stores.ll @@ -48,9 +48,9 @@ define dso_local void @_Z17test_fifo_st_pushRPDv64_DB8_S0_R12fifo_state_t(ptr no ; CHECK-LABEL: _Z17test_fifo_st_pushRPDv64_DB8_S0_R12fifo_state_t: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: vlda sfl, [p1, #0]; nopb ; nopxm ; nops -; CHECK-NEXT: lda p2, [p0, #0]; mov dj0, #128 -; CHECK-NEXT: lda r26, [p1, dj0] +; CHECK-NEXT: lda p2, [p0, #0]; nopb ; nops ; nopx ; mov p4, p1; nopv +; CHECK-NEXT: vlda sfl, [p4], #128 +; CHECK-NEXT: lda r26, [p4, #0] ; CHECK-NEXT: vlda sfh, [p1, #64] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -59,9 +59,9 @@ define dso_local void 
@_Z17test_fifo_st_pushRPDv64_DB8_S0_R12fifo_state_t(ptr no ; CHECK-NEXT: nop ; CHECK-NEXT: vst.push.512 x0, [p2, sf, r26] ; CHECK-NEXT: ret lr -; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 5 -; CHECK-NEXT: vst sfl, [p1], #128 // Delay Slot 4 -; CHECK-NEXT: st r26, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: st r26, [p4, #0] // Delay Slot 5 +; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4 +; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -85,9 +85,9 @@ define dso_local void @_Z18test_fifo_st_flushRPDv64_DB8_R12fifo_state_t(ptr noca ; CHECK-LABEL: _Z18test_fifo_st_flushRPDv64_DB8_R12fifo_state_t: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: vlda sfl, [p1, #0]; nopb ; nopxm ; nops -; CHECK-NEXT: lda p2, [p0, #0]; mov dj0, #128 -; CHECK-NEXT: lda r26, [p1, dj0] +; CHECK-NEXT: lda p2, [p0, #0]; nopb ; nops ; nopx ; mov p4, p1; nopv +; CHECK-NEXT: vlda sfl, [p4], #128 +; CHECK-NEXT: lda r26, [p4, #0] ; CHECK-NEXT: vlda sfh, [p1, #64] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -96,9 +96,9 @@ define dso_local void @_Z18test_fifo_st_flushRPDv64_DB8_R12fifo_state_t(ptr noca ; CHECK-NEXT: nop ; CHECK-NEXT: vst.flush.512 [p2, sf, r26] ; CHECK-NEXT: ret lr -; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 5 -; CHECK-NEXT: vst sfl, [p1], #128 // Delay Slot 4 -; CHECK-NEXT: st r26, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: st r26, [p4, #0] // Delay Slot 5 +; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4 +; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -121,9 +121,9 @@ define dso_local void @_Z26test_fifo_st_flush_1d_byteRPDv64_DB8_R12fifo_state_ti ; CHECK-LABEL: _Z26test_fifo_st_flush_1d_byteRPDv64_DB8_R12fifo_state_ti: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: vlda sfl, [p1, #0]; nopb ; nopxm -; CHECK-NEXT: lda p2, [p0, #0]; mov dj0, #128 -; CHECK-NEXT: lda 
r26, [p1, dj0] +; CHECK-NEXT: lda p2, [p0, #0]; nopb ; nopx ; mov p4, p1; nops +; CHECK-NEXT: vlda sfl, [p4], #128 +; CHECK-NEXT: lda r26, [p4, #0] ; CHECK-NEXT: vlda sfh, [p1, #64] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -132,9 +132,9 @@ define dso_local void @_Z26test_fifo_st_flush_1d_byteRPDv64_DB8_R12fifo_state_ti ; CHECK-NEXT: mov m0, r0 ; CHECK-NEXT: vst.flush.512 [p2, sf, r26, m0] ; CHECK-NEXT: ret lr -; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 5 -; CHECK-NEXT: vst sfl, [p1], #128 // Delay Slot 4 -; CHECK-NEXT: st r26, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: st r26, [p4, #0] // Delay Slot 5 +; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4 +; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -158,23 +158,24 @@ define dso_local void @_Z26test_fifo_st_flush_2d_byteRPDv64_DB8_R12fifo_state_ti ; CHECK-LABEL: _Z26test_fifo_st_flush_2d_byteRPDv64_DB8_R12fifo_state_tiiRii: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p4, [p0, #0]; nopx -; CHECK-NEXT: lda dc0, [p2, #0]; mov dj0, #128 -; CHECK-NEXT: lda r26, [p1, dj0] +; CHECK-NEXT: lda p5, [p0, #0] +; CHECK-NEXT: lda dc0, [p2, #0]; mov p3, p1 +; CHECK-NEXT: vlda sfl, [p3], #128 +; CHECK-NEXT: lda r26, [p3, #0] +; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: vlda sfl, [p1, #0] ; CHECK-NEXT: vlda sfh, [p1, #64] -; CHECK-NEXT: movs p3, p2 +; CHECK-NEXT: mov p4, p2 ; CHECK-NEXT: movs m0, r0; mov dn0, r1 -; CHECK-NEXT: movs dj0, r2; mov p2, p4 +; CHECK-NEXT: movs dj0, r2; mov p2, p5 ; CHECK-NEXT: vst.flush.512.conv.2d [p2, sf, r26, d0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st dc0, [p3, #0]; ret lr -; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 5 -; CHECK-NEXT: vst sfl, [p1], #128 // Delay Slot 4 -; CHECK-NEXT: st r26, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: st dc0, [p4, #0]; ret lr +; CHECK-NEXT: st r26, [p3, #0] // Delay Slot 5 +; CHECK-NEXT: vst sfl, [p1, #0] // Delay 
Slot 4 +; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -205,24 +206,24 @@ define dso_local void @_Z26test_fifo_st_flush_3d_byteRPDv64_DB8_R12fifo_state_ti ; CHECK-LABEL: _Z26test_fifo_st_flush_3d_byteRPDv64_DB8_R12fifo_state_tiiRiiiS5_i: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: vlda sfl, [p1, #0]; nopxm -; CHECK-NEXT: lda p5, [p0, #0] +; CHECK-NEXT: lda r5, [p0, #0]; nopxm ; CHECK-NEXT: lda dc0, [p2, #0] -; CHECK-NEXT: lda dc4, [p3, #0]; mov dj0, #128 -; CHECK-NEXT: lda r26, [p1, dj0] +; CHECK-NEXT: lda dc4, [p3, #0]; mov p4, p1 +; CHECK-NEXT: vlda sfl, [p4], #128 +; CHECK-NEXT: lda r26, [p4, #0] ; CHECK-NEXT: vlda sfh, [p1, #64] ; CHECK-NEXT: nop -; CHECK-NEXT: movs p4, p2 +; CHECK-NEXT: mov p5, p2 ; CHECK-NEXT: movs m0, r0; mov dn0, r1 -; CHECK-NEXT: movs dn4, r3; mov dj4, r4 -; CHECK-NEXT: movs dj0, r2; mov p2, p5 +; CHECK-NEXT: movs dj0, r2; mov dn4, r3 +; CHECK-NEXT: movs dj4, r4; mov p2, r5 ; CHECK-NEXT: vst.flush.512.3d [p2, sf, r26, d0] ; CHECK-NEXT: nop -; CHECK-NEXT: st dc0, [p4, #0] +; CHECK-NEXT: st dc0, [p5, #0] ; CHECK-NEXT: st dc4, [p3, #0]; ret lr -; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 5 -; CHECK-NEXT: vst sfl, [p1], #128 // Delay Slot 4 -; CHECK-NEXT: st r26, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: st r26, [p4, #0] // Delay Slot 5 +; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4 +; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -260,10 +261,11 @@ define dso_local void @_Z23test_fifo_st_flush_convRPDv64_DB8_R12fifo_state_t(ptr ; CHECK-LABEL: _Z23test_fifo_st_flush_convRPDv64_DB8_R12fifo_state_t: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p2, [p0, #0]; nopb ; nopx ; mov dj0, #128; nops -; CHECK-NEXT: lda r26, [p1, dj0] +; CHECK-NEXT: lda p2, [p0, #0]; nopb ; nopx ; mov p4, p1 +; CHECK-NEXT: vlda sfl, [p4], 
#128 +; CHECK-NEXT: lda r26, [p4, #0] +; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: vlda sfl, [p1, #0] ; CHECK-NEXT: vlda sfh, [p1, #64] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -272,9 +274,9 @@ define dso_local void @_Z23test_fifo_st_flush_convRPDv64_DB8_R12fifo_state_t(ptr ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: ret lr -; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 5 -; CHECK-NEXT: vst sfl, [p1], #128 // Delay Slot 4 -; CHECK-NEXT: st r26, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: st r26, [p4, #0] // Delay Slot 5 +; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4 +; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -297,10 +299,11 @@ define dso_local void @_Z31test_fifo_st_flush_conv_1d_byteRPDv64_DB8_R12fifo_sta ; CHECK-LABEL: _Z31test_fifo_st_flush_conv_1d_byteRPDv64_DB8_R12fifo_state_ti: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p2, [p0, #0]; nopb ; nopx ; mov dj0, #128 -; CHECK-NEXT: lda r26, [p1, dj0] +; CHECK-NEXT: lda p2, [p0, #0]; nopx ; mov p4, p1 +; CHECK-NEXT: vlda sfl, [p4], #128 +; CHECK-NEXT: lda r26, [p4, #0] +; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: vlda sfl, [p1, #0] ; CHECK-NEXT: vlda sfh, [p1, #64] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -309,9 +312,9 @@ define dso_local void @_Z31test_fifo_st_flush_conv_1d_byteRPDv64_DB8_R12fifo_sta ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: ret lr -; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 5 -; CHECK-NEXT: vst sfl, [p1], #128 // Delay Slot 4 -; CHECK-NEXT: st r26, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: st r26, [p4, #0] // Delay Slot 5 +; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4 +; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -335,22 +338,22 @@ define dso_local void @_Z31test_fifo_st_flush_conv_2d_byteRPDv64_DB8_R12fifo_sta ; CHECK-LABEL: 
_Z31test_fifo_st_flush_conv_2d_byteRPDv64_DB8_R12fifo_state_tiiRii: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: vlda sfl, [p1, #0]; nopb ; nopx -; CHECK-NEXT: lda p4, [p0, #0] -; CHECK-NEXT: lda dc0, [p2, #0]; mov dj0, #128 -; CHECK-NEXT: lda r26, [p1, dj0] +; CHECK-NEXT: lda p5, [p0, #0]; nopb ; nopx +; CHECK-NEXT: lda dc0, [p2, #0]; mov p3, p1 +; CHECK-NEXT: vlda sfl, [p3], #128 +; CHECK-NEXT: lda r26, [p3, #0] ; CHECK-NEXT: vlda sfh, [p1, #64] ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: movs p3, p2 +; CHECK-NEXT: mov p4, p2 ; CHECK-NEXT: movs m0, r0; mov dn0, r1 -; CHECK-NEXT: movs dj0, r2; mov p2, p4 +; CHECK-NEXT: movs dj0, r2; mov p2, p5 ; CHECK-NEXT: vst.flush.512.2d [p2, sf, r26, d0] ; CHECK-NEXT: nop -; CHECK-NEXT: st dc0, [p3, #0]; ret lr -; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 5 -; CHECK-NEXT: vst sfl, [p1], #128 // Delay Slot 4 -; CHECK-NEXT: st r26, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: st dc0, [p4, #0]; ret lr +; CHECK-NEXT: st r26, [p3, #0] // Delay Slot 5 +; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4 +; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -381,25 +384,26 @@ define dso_local void @_Z31test_fifo_st_flush_conv_3d_byteRPDv64_DB8_R12fifo_sta ; CHECK-LABEL: _Z31test_fifo_st_flush_conv_3d_byteRPDv64_DB8_R12fifo_state_tiiRiiiS5_i: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p5, [p0, #0]; nopb ; nopx +; CHECK-NEXT: lda r5, [p0, #0]; nopx ; CHECK-NEXT: lda dc0, [p2, #0] -; CHECK-NEXT: lda dc4, [p3, #0]; mov dj0, #128 -; CHECK-NEXT: lda r26, [p1, dj0] +; CHECK-NEXT: lda dc4, [p3, #0]; mov p4, p1 +; CHECK-NEXT: vlda sfl, [p4], #128 +; CHECK-NEXT: lda r26, [p4, #0] ; CHECK-NEXT: nop -; CHECK-NEXT: vlda sfl, [p1, #0] -; CHECK-NEXT: vlda sfh, [p1, #64]; movs p4, p2 +; CHECK-NEXT: nop +; CHECK-NEXT: vlda sfh, [p1, #64]; mov p5, p2 ; CHECK-NEXT: movs m0, r0; mov dn0, r1 -; CHECK-NEXT: movs dn4, 
r3; mov dj4, r4 -; CHECK-NEXT: movs dj0, r2; mov p2, p5 +; CHECK-NEXT: movs dj0, r2; mov dn4, r3 +; CHECK-NEXT: movs dj4, r4; mov p2, r5 ; CHECK-NEXT: vst.flush.512.conv.3d [p2, sf, r26, d0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st dc0, [p4, #0] +; CHECK-NEXT: st dc0, [p5, #0] ; CHECK-NEXT: st dc4, [p3, #0]; ret lr -; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 5 -; CHECK-NEXT: vst sfl, [p1], #128 // Delay Slot 4 -; CHECK-NEXT: st r26, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: st r26, [p4, #0] // Delay Slot 5 +; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4 +; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -470,9 +474,9 @@ define dso_local void @test_fifo_st_push_v64bfp16ebs16(ptr nocapture nonnull ali ; CHECK-LABEL: test_fifo_st_push_v64bfp16ebs16: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: vlda sfl, [p1, #0]; nopb ; nopxm ; nops -; CHECK-NEXT: lda p2, [p0, #0]; mov dj0, #128 -; CHECK-NEXT: lda r26, [p1, dj0] +; CHECK-NEXT: lda p2, [p0, #0]; nopb ; nops ; nopx ; mov p4, p1; nopv +; CHECK-NEXT: vlda sfl, [p4], #128 +; CHECK-NEXT: lda r26, [p4, #0] ; CHECK-NEXT: vlda sfh, [p1, #64] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -481,9 +485,9 @@ define dso_local void @test_fifo_st_push_v64bfp16ebs16(ptr nocapture nonnull ali ; CHECK-NEXT: nop ; CHECK-NEXT: vst.push.544 ex0, [p2, sf, r26] ; CHECK-NEXT: ret lr -; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 5 -; CHECK-NEXT: vst sfl, [p1], #128 // Delay Slot 4 -; CHECK-NEXT: st r26, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: st r26, [p4, #0] // Delay Slot 5 +; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4 +; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -540,9 +544,9 @@ define dso_local void @test_fifo_st_push_v64bfp16ebs8(ptr nocapture nonnull alig ; CHECK-LABEL: test_fifo_st_push_v64bfp16ebs8: ; 
CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: vlda sfl, [p1, #0]; nopb ; nopxm ; nops -; CHECK-NEXT: lda p2, [p0, #0]; mov dj0, #128 -; CHECK-NEXT: lda r26, [p1, dj0] +; CHECK-NEXT: lda p2, [p0, #0]; nopb ; nops ; nopx ; mov p4, p1; nopv +; CHECK-NEXT: vlda sfl, [p4], #128 +; CHECK-NEXT: lda r26, [p4, #0] ; CHECK-NEXT: vlda sfh, [p1, #64] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -551,9 +555,9 @@ define dso_local void @test_fifo_st_push_v64bfp16ebs8(ptr nocapture nonnull alig ; CHECK-NEXT: nop ; CHECK-NEXT: vst.push.576 ex0, [p2, sf, r26] ; CHECK-NEXT: ret lr -; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 5 -; CHECK-NEXT: vst sfl, [p1], #128 // Delay Slot 4 -; CHECK-NEXT: st r26, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: st r26, [p4, #0] // Delay Slot 5 +; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4 +; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -578,9 +582,9 @@ define dso_local void @_Z17test_fifo_st_pushRrP23v128bfp16ebs8_unaligned13v128bf ; CHECK-LABEL: _Z17test_fifo_st_pushRrP23v128bfp16ebs8_unaligned13v128bfp16ebs8R12fifo_state_t: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: vlda sfl, [p1, #0]; nopb ; nops ; nopxm ; nopv -; CHECK-NEXT: lda p2, [p0, #0]; mov dj0, #128 -; CHECK-NEXT: lda r26, [p1, dj0]; nopx +; CHECK-NEXT: lda p2, [p0, #0]; nopb ; nops ; nopx ; mov p3, p1; nopv +; CHECK-NEXT: vlda sfl, [p3], #128; nopb ; nopx +; CHECK-NEXT: lda r26, [p3, #0] ; CHECK-NEXT: vlda sfh, [p1, #64] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -589,12 +593,12 @@ define dso_local void @_Z17test_fifo_st_pushRrP23v128bfp16ebs8_unaligned13v128bf ; CHECK-NEXT: nop ; CHECK-NEXT: vst.push.576 ex0, [p2, sf, r26] ; CHECK-NEXT: nop -; CHECK-NEXT: st r26, [p1, dj0] +; CHECK-NEXT: st r26, [p3, #0] ; CHECK-NEXT: vst sfl, [p1, #0] ; CHECK-NEXT: vst sfh, [p1, #64] ; CHECK-NEXT: st p2, [p0, #0] ; CHECK-NEXT: vlda sfl, [p1, #0] -; CHECK-NEXT: lda r26, [p1, 
dj0] +; CHECK-NEXT: lda r26, [p3, #0] ; CHECK-NEXT: vlda sfh, [p1, #64] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -603,9 +607,9 @@ define dso_local void @_Z17test_fifo_st_pushRrP23v128bfp16ebs8_unaligned13v128bf ; CHECK-NEXT: nop ; CHECK-NEXT: vst.push.576 ex1, [p2, sf, r26] ; CHECK-NEXT: ret lr -; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 5 -; CHECK-NEXT: vst sfl, [p1], #128 // Delay Slot 4 -; CHECK-NEXT: st r26, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: st r26, [p3, #0] // Delay Slot 5 +; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4 +; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -640,9 +644,9 @@ define dso_local void @_Z18test_fifo_st_flushRrP23v128bfp16ebs8_unalignedR12fifo ; CHECK-LABEL: _Z18test_fifo_st_flushRrP23v128bfp16ebs8_unalignedR12fifo_state_t: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: vlda sfl, [p1, #0]; nopb ; nops ; nopxm ; nopv -; CHECK-NEXT: lda p2, [p0, #0]; mov dj0, #128 -; CHECK-NEXT: lda r26, [p1, dj0]; nopx +; CHECK-NEXT: lda p2, [p0, #0]; nopb ; nops ; nopx ; mov p3, p1; nopv +; CHECK-NEXT: vlda sfl, [p3], #128; nopb ; nopx +; CHECK-NEXT: lda r26, [p3, #0] ; CHECK-NEXT: vlda sfh, [p1, #64] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -651,12 +655,12 @@ define dso_local void @_Z18test_fifo_st_flushRrP23v128bfp16ebs8_unalignedR12fifo ; CHECK-NEXT: nop ; CHECK-NEXT: vst.flush.512 [p2, sf, r26] ; CHECK-NEXT: nop -; CHECK-NEXT: st r26, [p1, dj0] +; CHECK-NEXT: st r26, [p3, #0] ; CHECK-NEXT: vst sfl, [p1, #0] ; CHECK-NEXT: vst sfh, [p1, #64] ; CHECK-NEXT: st p2, [p0, #0] ; CHECK-NEXT: vlda sfl, [p1, #0] -; CHECK-NEXT: lda r26, [p1, dj0] +; CHECK-NEXT: lda r26, [p3, #0] ; CHECK-NEXT: vlda sfh, [p1, #64] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -665,9 +669,9 @@ define dso_local void @_Z18test_fifo_st_flushRrP23v128bfp16ebs8_unalignedR12fifo ; CHECK-NEXT: nop ; CHECK-NEXT: vst.flush.512 [p2, sf, r26] ; CHECK-NEXT: ret lr -; CHECK-NEXT: vst 
sfh, [p1, #64] // Delay Slot 5 -; CHECK-NEXT: vst sfl, [p1], #128 // Delay Slot 4 -; CHECK-NEXT: st r26, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: st r26, [p3, #0] // Delay Slot 5 +; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4 +; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -698,9 +702,9 @@ define dso_local void @_Z26test_fifo_st_flush_1d_byteRrP23v128bfp16ebs8_unaligne ; CHECK-LABEL: _Z26test_fifo_st_flush_1d_byteRrP23v128bfp16ebs8_unalignedR12fifo_state_ti: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: vlda sfl, [p1, #0]; nopb ; nops ; nopxm ; nopv -; CHECK-NEXT: lda p2, [p0, #0]; mov dj0, #128 -; CHECK-NEXT: lda r26, [p1, dj0] +; CHECK-NEXT: lda p2, [p0, #0]; nopb ; nops ; nopx ; mov p3, p1; nopv +; CHECK-NEXT: vlda sfl, [p3], #128; nopx +; CHECK-NEXT: lda r26, [p3, #0] ; CHECK-NEXT: vlda sfh, [p1, #64] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -709,12 +713,12 @@ define dso_local void @_Z26test_fifo_st_flush_1d_byteRrP23v128bfp16ebs8_unaligne ; CHECK-NEXT: nop ; CHECK-NEXT: vst.flush.512 [p2, sf, r26] ; CHECK-NEXT: nop -; CHECK-NEXT: st r26, [p1, dj0] +; CHECK-NEXT: st r26, [p3, #0] ; CHECK-NEXT: vst sfl, [p1, #0] ; CHECK-NEXT: vst sfh, [p1, #64] ; CHECK-NEXT: st p2, [p0, #0] ; CHECK-NEXT: vlda sfl, [p1, #0] -; CHECK-NEXT: lda r26, [p1, dj0] +; CHECK-NEXT: lda r26, [p3, #0] ; CHECK-NEXT: vlda sfh, [p1, #64] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -723,9 +727,9 @@ define dso_local void @_Z26test_fifo_st_flush_1d_byteRrP23v128bfp16ebs8_unaligne ; CHECK-NEXT: mov m0, r0 ; CHECK-NEXT: vst.flush.512 [p2, sf, r26, m0] ; CHECK-NEXT: ret lr -; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 5 -; CHECK-NEXT: vst sfl, [p1], #128 // Delay Slot 4 -; CHECK-NEXT: st r26, [p1, #0] // Delay Slot 3 +; CHECK-NEXT: st r26, [p3, #0] // Delay Slot 5 +; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4 +; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p2, [p0, #0] 
// Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: diff --git a/llvm/test/CodeGen/AIE/aie2p/llc-pipeline-aie2p.ll b/llvm/test/CodeGen/AIE/aie2p/llc-pipeline-aie2p.ll index ff4d4f5dc8cd..9f87c2e8bf19 100644 --- a/llvm/test/CodeGen/AIE/aie2p/llc-pipeline-aie2p.ll +++ b/llvm/test/CodeGen/AIE/aie2p/llc-pipeline-aie2p.ll @@ -175,6 +175,7 @@ ; AIE-O1-NEXT: MachineDominator Tree Construction ; AIE-O1-NEXT: AIE2P Post Legalizer Generic Combiner ; AIE-O1-NEXT: AIE Base Address Clustering Optimization +; AIE-O1-NEXT: AIE Pointer Modifier Optimization ; AIE-O1-NEXT: AIE2P Post Legalizer Custom Combiner ; AIE-O1-NEXT: RegBankSelect ; AIE-O1-NEXT: Analysis for ComputingKnownBits @@ -376,6 +377,7 @@ ; AIE-O23-NEXT: MachineDominator Tree Construction ; AIE-O23-NEXT: AIE2P Post Legalizer Generic Combiner ; AIE-O23-NEXT: AIE Base Address Clustering Optimization +; AIE-O23-NEXT: AIE Pointer Modifier Optimization ; AIE-O23-NEXT: AIE2P Post Legalizer Custom Combiner ; AIE-O23-NEXT: RegBankSelect ; AIE-O23-NEXT: Analysis for ComputingKnownBits diff --git a/llvm/test/CodeGen/AIE/aie2p/load-store-unaligned.ll b/llvm/test/CodeGen/AIE/aie2p/load-store-unaligned.ll index 63f0c74d7687..51bd16ec3b22 100644 --- a/llvm/test/CodeGen/AIE/aie2p/load-store-unaligned.ll +++ b/llvm/test/CodeGen/AIE/aie2p/load-store-unaligned.ll @@ -16,693 +16,760 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-LABEL: test_load_store_unaligned: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: mova m0, #-560 -; CHECK-NEXT: paddxm [sp], #576 -; CHECK-NEXT: st p7, [sp, #-576] // 4-byte Folded Spill -; CHECK-NEXT: mov p7, sp -; CHECK-NEXT: padda [p7], m0 -; CHECK-NEXT: st.s16 r0, [p7, #0] +; CHECK-NEXT: mova m0, #-600; nopx +; CHECK-NEXT: paddxm [sp], #640 +; CHECK-NEXT: mov p1, sp +; CHECK-NEXT: mov p0, sp +; CHECK-NEXT: mov p2, sp +; CHECK-NEXT: mov p4, sp +; CHECK-NEXT: mov p3, sp +; CHECK-NEXT: mov p5, sp +; CHECK-NEXT: st p6, [sp, #-636] // 4-byte 
Folded Spill +; CHECK-NEXT: mov p6, sp +; CHECK-NEXT: st r8, [sp, #-604] // 4-byte Folded Spill +; CHECK-NEXT: padda [p1], m0 +; CHECK-NEXT: mova m0, #-584 +; CHECK-NEXT: padda [p0], m0 +; CHECK-NEXT: mova m0, #-568 +; CHECK-NEXT: mov r16, p0 +; CHECK-NEXT: padda [p2], m0 +; CHECK-NEXT: mova m0, #-544 +; CHECK-NEXT: mov p0, sp +; CHECK-NEXT: padda [p4], m0 +; CHECK-NEXT: mova m0, #-480 +; CHECK-NEXT: padda [p0], #-512 +; CHECK-NEXT: mov r8, p0 +; CHECK-NEXT: mov p0, sp +; CHECK-NEXT: padda [p0], m0 +; CHECK-NEXT: mova m0, #-304 +; CHECK-NEXT: mov r25, p0 +; CHECK-NEXT: mov p0, sp +; CHECK-NEXT: padda [p0], #-448 +; CHECK-NEXT: mov r24, p0 +; CHECK-NEXT: mov p0, sp +; CHECK-NEXT: padda [p0], #-320 +; CHECK-NEXT: mov r27, p0 +; CHECK-NEXT: mov p0, sp +; CHECK-NEXT: padda [p0], m0 +; CHECK-NEXT: mova m0, #-288 +; CHECK-NEXT: mov r26, p0 +; CHECK-NEXT: padda [p3], m0 +; CHECK-NEXT: mova m0, #-272 +; CHECK-NEXT: mov p0, sp +; CHECK-NEXT: padda [p5], m0 +; CHECK-NEXT: mova m0, #-240 +; CHECK-NEXT: padda [p0], m0 +; CHECK-NEXT: mova m0, #-208 +; CHECK-NEXT: mov r29, p0 +; CHECK-NEXT: mov p0, sp +; CHECK-NEXT: padda [p0], m0 +; CHECK-NEXT: mova m0, #-176 +; CHECK-NEXT: st p0, [sp, #-44] // 4-byte Folded Spill +; CHECK-NEXT: mov p0, sp +; CHECK-NEXT: padda [p6], m0 +; CHECK-NEXT: mova m0, #-112 +; CHECK-NEXT: padda [p0], m0 +; CHECK-NEXT: st p0, [sp, #-48] // 4-byte Folded Spill +; CHECK-NEXT: mov p0, p1 +; CHECK-NEXT: mov p1, p0 +; CHECK-NEXT: st.s16 r0, [p1], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: vextract.32 r0, x0, #0, vaddsign1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st.s16 r1, [p7, #2] +; CHECK-NEXT: st.s16 r1, [p1, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: vextract.32 r1, x0, #1, vaddsign1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st.s16 r2, [p7, #4] +; CHECK-NEXT: st.s16 r2, [p0, #4] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: vextract.32 r2, x0, #2, vaddsign1 ; 
CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st.s16 r3, [p7, #6] +; CHECK-NEXT: st.s16 r3, [p0, #6] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: vextract.32 r3, x0, #3, vaddsign1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st.s16 r4, [p7, #8] +; CHECK-NEXT: st.s16 r4, [p0, #8] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: vextract.32 r4, x0, #4, vaddsign1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st.s16 r5, [p7, #10] +; CHECK-NEXT: st.s16 r5, [p0, #10] +; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: vmov q0, wl2 ; CHECK-NEXT: vextract.32 r5, x0, #5, vaddsign1 +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: st.s16 r6, [p0, #12] +; CHECK-NEXT: nop ; CHECK-NEXT: vextract.32 r6, x0, #6, vaddsign1 +; CHECK-NEXT: vmov q0, wl2 ; CHECK-NEXT: vextract.32 r7, x0, #7, vaddsign1 -; CHECK-NEXT: st.s16 r6, [p7, #12] -; CHECK-NEXT: mov p0, sp ; CHECK-NEXT: vmov wl0, q0 -; CHECK-NEXT: mova m0, #-544 -; CHECK-NEXT: padda [p0], m0 -; CHECK-NEXT: mova m0, #-528 -; CHECK-NEXT: mov r27, p7 -; CHECK-NEXT: st.s16 r7, [p7, #14] -; CHECK-NEXT: mov r16, p0 -; CHECK-NEXT: mov p0, sp -; CHECK-NEXT: padda [p0], m0 +; CHECK-NEXT: mov r28, p0 +; CHECK-NEXT: st.s16 r7, [p0, #14] +; CHECK-NEXT: mov r31, p1 +; CHECK-NEXT: mov p1, r16 ; CHECK-NEXT: vextract.32 r0, x0, #0, vaddsign1 ; CHECK-NEXT: vextract.32 r1, x0, #1, vaddsign1 -; CHECK-NEXT: mov p7, r16 -; CHECK-NEXT: st r1, [p7, #4] -; CHECK-NEXT: st r0, [p7, #0] -; CHECK-NEXT: st.s8 r0, [p0, #0] +; CHECK-NEXT: mov p0, p1 +; CHECK-NEXT: vextract.32 r2, x0, #2, vaddsign1 +; CHECK-NEXT: st r0, [p0], #4 +; CHECK-NEXT: vextract.32 r3, x0, #3, vaddsign1 +; CHECK-NEXT: st r2, [p1, #8] +; CHECK-NEXT: st r1, [p0, #0] +; CHECK-NEXT: mov r30, p0 +; CHECK-NEXT: mov p0, p2 +; CHECK-NEXT: st r3, [p1, #12] +; CHECK-NEXT: st.s8 r0, [p0], #1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: vextract.16 r0, x4, #0, vaddsign1 -; CHECK-NEXT: vextract.32 r2, 
x0, #2, vaddsign1 ; CHECK-NEXT: nop -; CHECK-NEXT: st r2, [p7, #8] -; CHECK-NEXT: st.s8 r1, [p0, #1] +; CHECK-NEXT: nop +; CHECK-NEXT: st.s8 r1, [p0, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: vextract.16 r1, x4, #1, vaddsign1 -; CHECK-NEXT: vextract.32 r3, x0, #3, vaddsign1 ; CHECK-NEXT: nop -; CHECK-NEXT: st r3, [p7, #12] -; CHECK-NEXT: st.s8 r2, [p0, #2] +; CHECK-NEXT: nop +; CHECK-NEXT: st.s8 r2, [p2, #2] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: vextract.16 r2, x4, #2, vaddsign1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st.s8 r3, [p0, #3] +; CHECK-NEXT: st.s8 r3, [p2, #3] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: vextract.16 r3, x4, #3, vaddsign1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st.s8 r4, [p0, #4] +; CHECK-NEXT: st.s8 r4, [p2, #4] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: vextract.16 r4, x4, #4, vaddsign1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st r8, [sp, #-568] // 4-byte Folded Spill -; CHECK-NEXT: st p6, [sp, #-572] // 4-byte Folded Spill -; CHECK-NEXT: st.s8 r5, [p0, #5] +; CHECK-NEXT: st.s8 r5, [p2, #5] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: vextract.16 r5, x4, #5, vaddsign1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st.s8 r6, [p0, #6] +; CHECK-NEXT: st.s8 r6, [p2, #6] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: vextract.16 r6, x4, #6, vaddsign1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st.s8 r7, [p0, #7] +; CHECK-NEXT: st.s8 r7, [p2, #7] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: vextract.16 r7, x4, #7, vaddsign1 ; CHECK-NEXT: nop ; CHECK-NEXT: mova dj0, #8 -; CHECK-NEXT: st.s8 r16, [p0, dj0] +; CHECK-NEXT: st.s8 r16, [p2, dj0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: vextract.16 r16, x4, #8, vaddsign1 ; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: st r9, [sp, #-608] // 4-byte Folded 
Spill +; CHECK-NEXT: st r10, [sp, #-612] // 4-byte Folded Spill +; CHECK-NEXT: st p7, [sp, #-640] // 4-byte Folded Spill +; CHECK-NEXT: st r11, [sp, #-616] // 4-byte Folded Spill +; CHECK-NEXT: st r12, [sp, #-620] // 4-byte Folded Spill +; CHECK-NEXT: st r13, [sp, #-624] // 4-byte Folded Spill +; CHECK-NEXT: st r14, [sp, #-628] // 4-byte Folded Spill +; CHECK-NEXT: st r15, [sp, #-632] // 4-byte Folded Spill ; CHECK-NEXT: mova dj0, #9 -; CHECK-NEXT: st.s8 r17, [p0, dj0] +; CHECK-NEXT: st.s8 r17, [p2, dj0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: vextract.16 r17, x4, #9, vaddsign1 ; CHECK-NEXT: nop ; CHECK-NEXT: mova dj2, #10 -; CHECK-NEXT: st.s8 r18, [p0, dj2] +; CHECK-NEXT: st.s8 r18, [p2, dj2] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: vextract.16 r18, x4, #10, vaddsign1 ; CHECK-NEXT: nop ; CHECK-NEXT: mova dj0, #11 -; CHECK-NEXT: st.s8 r19, [p0, dj0] +; CHECK-NEXT: st.s8 r19, [p2, dj0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: vextract.16 r19, x4, #11, vaddsign1 ; CHECK-NEXT: nop ; CHECK-NEXT: mova dj1, #12 -; CHECK-NEXT: st.s8 r20, [p0, dj1] +; CHECK-NEXT: st.s8 r20, [p2, dj1] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: vextract.16 r20, x4, #12, vaddsign1 ; CHECK-NEXT: nop ; CHECK-NEXT: mova dj0, #13 -; CHECK-NEXT: st.s8 r21, [p0, dj0] +; CHECK-NEXT: st.s8 r21, [p2, dj0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: vextract.16 r21, x4, #13, vaddsign1 ; CHECK-NEXT: nop ; CHECK-NEXT: mova dj3, #14 -; CHECK-NEXT: st.s8 r22, [p0, dj3] +; CHECK-NEXT: st.s8 r22, [p2, dj3] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: vextract.16 r22, x4, #14, vaddsign1 ; CHECK-NEXT: nop ; CHECK-NEXT: mova dj0, #15 -; CHECK-NEXT: st.s8 r23, [p0, dj0] +; CHECK-NEXT: st.s8 r23, [p2, dj0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: vextract.16 r23, x4, #15, vaddsign1 ; CHECK-NEXT: nop -; CHECK-NEXT: mov p2, sp -; CHECK-NEXT: padda [p2], 
#-512 -; CHECK-NEXT: st.s16 r0, [p2, #0] +; CHECK-NEXT: mov r12, p0 +; CHECK-NEXT: mov p0, p4 +; CHECK-NEXT: st.s16 r0, [p0], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: vextract.16 r0, x6, #0, vaddsign1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st.s16 r1, [p2, #2] +; CHECK-NEXT: st.s16 r1, [p0, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: vextract.16 r1, x6, #1, vaddsign1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st.s16 r2, [p2, #4] +; CHECK-NEXT: st.s16 r2, [p4, #4] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: vextract.16 r2, x6, #2, vaddsign1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st.s16 r3, [p2, #6] +; CHECK-NEXT: st.s16 r3, [p4, #6] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: vextract.16 r3, x6, #3, vaddsign1 ; CHECK-NEXT: nop -; CHECK-NEXT: mova dj4, #32 -; CHECK-NEXT: st.s16 r4, [p2, #8] -; CHECK-NEXT: mova dj5, #36 -; CHECK-NEXT: mova dj6, #40 -; CHECK-NEXT: mova dj7, #44 +; CHECK-NEXT: nop +; CHECK-NEXT: st.s16 r4, [p4, #8] +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop ; CHECK-NEXT: vextract.16 r4, x6, #4, vaddsign1 -; CHECK-NEXT: vmov x2, bmll0 -; CHECK-NEXT: mov p1, sp -; CHECK-NEXT: st.s16 r5, [p2, #10] -; CHECK-NEXT: mov p5, sp -; CHECK-NEXT: vmov bmll0, x2 -; CHECK-NEXT: mova m0, #-480 +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: st.s16 r5, [p4, #10] +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop ; CHECK-NEXT: vextract.16 r5, x6, #5, vaddsign1 -; CHECK-NEXT: padda [p1], m0 -; CHECK-NEXT: mova m0, #-416 -; CHECK-NEXT: st.s16 r6, [p2, #12] -; CHECK-NEXT: mov r30, p1 -; CHECK-NEXT: mov p1, sp -; CHECK-NEXT: mov r29, p7 +; CHECK-NEXT: nop +; CHECK-NEXT: mov r9, p1 +; CHECK-NEXT: st.s16 r6, [p4, #12] ; CHECK-NEXT: vextract.16 r6, x6, #6, vaddsign1 +; CHECK-NEXT: nop +; CHECK-NEXT: mov r13, p0 +; CHECK-NEXT: mov p0, r8 ; CHECK-NEXT: vextract.32 r0, x8, #0, vaddsign1 -; CHECK-NEXT: mov p7, r30 -; 
CHECK-NEXT: st r0, [p7, #0] -; CHECK-NEXT: st.s16 r7, [p2, #14] -; CHECK-NEXT: padda [p1], #-448 -; CHECK-NEXT: mov r31, p1 -; CHECK-NEXT: mov p1, sp +; CHECK-NEXT: mov p1, p0 +; CHECK-NEXT: st r0, [p1], #4 +; CHECK-NEXT: st.s16 r7, [p4, #14] +; CHECK-NEXT: mova dj4, #32 +; CHECK-NEXT: mova dj5, #36 +; CHECK-NEXT: mova dj6, #40 ; CHECK-NEXT: vextract.16 r7, x6, #7, vaddsign1 ; CHECK-NEXT: vextract.32 r1, x8, #1, vaddsign1 ; CHECK-NEXT: mova dj0, #16 -; CHECK-NEXT: st r1, [p7, #4] -; CHECK-NEXT: st.s16 r16, [p2, dj0] -; CHECK-NEXT: padda [p1], m0 -; CHECK-NEXT: mova m0, #-352 -; CHECK-NEXT: mov r8, p1 +; CHECK-NEXT: st r1, [p1, #0] +; CHECK-NEXT: st.s16 r16, [p4, dj0] +; CHECK-NEXT: mova dj7, #44 +; CHECK-NEXT: vmov x2, bmll0 +; CHECK-NEXT: mov p7, sp ; CHECK-NEXT: vextract.16 r16, x6, #8, vaddsign1 -; CHECK-NEXT: vmov x0, bmll0 +; CHECK-NEXT: vmov bmll0, x2 ; CHECK-NEXT: mova dj0, #18 -; CHECK-NEXT: st.s16 r17, [p2, dj0] -; CHECK-NEXT: mov p1, sp -; CHECK-NEXT: padda [p5], m0 +; CHECK-NEXT: st.s16 r17, [p4, dj0] +; CHECK-NEXT: padda [p7], #-384 +; CHECK-NEXT: vmov x0, bmll0 ; CHECK-NEXT: vextract.16 r17, x6, #9, vaddsign1 ; CHECK-NEXT: vextract.32 r2, x8, #2, vaddsign1 ; CHECK-NEXT: vextract.32 r3, x8, #3, vaddsign1 ; CHECK-NEXT: mova dj0, #20 -; CHECK-NEXT: st r2, [p7, #8] -; CHECK-NEXT: st r3, [p7, #12] -; CHECK-NEXT: st.s16 r18, [p2, dj0] -; CHECK-NEXT: mova m0, #-288 -; CHECK-NEXT: padda [p1], m0 -; CHECK-NEXT: mov r28, p1 -; CHECK-NEXT: vextract.16 r18, x6, #10, vaddsign1 +; CHECK-NEXT: st r2, [p0, #8] +; CHECK-NEXT: st r3, [p0, #12] +; CHECK-NEXT: st.s16 r18, [p4, dj0] ; CHECK-NEXT: vmov bmll0, x2 -; CHECK-NEXT: mova dj0, #22 -; CHECK-NEXT: st.s16 r19, [p2, dj0] -; CHECK-NEXT: mov r30, p7 -; CHECK-NEXT: mova dj2, #60 +; CHECK-NEXT: vextract.16 r18, x6, #10, vaddsign1 ; CHECK-NEXT: vextract.16 r19, x6, #11, vaddsign1 +; CHECK-NEXT: mova dj2, #60 +; CHECK-NEXT: mov r8, p7 +; CHECK-NEXT: mova dj0, #22 +; CHECK-NEXT: st.s16 r19, [p4, dj0] +; CHECK-NEXT: 
vextract.16 r20, x6, #12, vaddsign1 +; CHECK-NEXT: mova dj1, #48 +; CHECK-NEXT: mov r10, p0 ; CHECK-NEXT: vextract.32 r4, x8, #4, vaddsign1 ; CHECK-NEXT: vextract.32 r5, x8, #5, vaddsign1 ; CHECK-NEXT: mova dj0, #24 -; CHECK-NEXT: st r4, [p7, #16] -; CHECK-NEXT: st r5, [p7, #20] -; CHECK-NEXT: st.s16 r20, [p2, dj0] -; CHECK-NEXT: mova dj1, #48 -; CHECK-NEXT: mova dj3, #52 -; CHECK-NEXT: vextract.16 r20, x6, #12, vaddsign1 +; CHECK-NEXT: st r4, [p0, #16] +; CHECK-NEXT: st r5, [p0, #20] +; CHECK-NEXT: st.s16 r20, [p4, dj0] ; CHECK-NEXT: vextract.16 r21, x6, #13, vaddsign1 ; CHECK-NEXT: vextract.16 r22, x6, #14, vaddsign1 -; CHECK-NEXT: mova dj0, #26 -; CHECK-NEXT: st.s16 r21, [p2, dj0] ; CHECK-NEXT: vextract.16 r23, x6, #15, vaddsign1 +; CHECK-NEXT: mova dj3, #52 +; CHECK-NEXT: mov r11, p1 +; CHECK-NEXT: mova dj0, #26 +; CHECK-NEXT: st.s16 r21, [p4, dj0] ; CHECK-NEXT: vextract.64 r1:r0, x0, #0, vaddsign1 ; CHECK-NEXT: vmov x0, bmll0 +; CHECK-NEXT: vmov bmll0, x2 ; CHECK-NEXT: vextract.32 r6, x8, #6, vaddsign1 ; CHECK-NEXT: vextract.32 r7, x8, #7, vaddsign1 ; CHECK-NEXT: mova dj0, #28 -; CHECK-NEXT: st r6, [p7, #24] -; CHECK-NEXT: st r7, [p7, #28] -; CHECK-NEXT: st.s16 r22, [p2, dj0] -; CHECK-NEXT: vmov bmll0, x2 +; CHECK-NEXT: st r6, [p0, #24] +; CHECK-NEXT: st r7, [p0, #28] +; CHECK-NEXT: st.s16 r22, [p4, dj0] ; CHECK-NEXT: vextract.64 r3:r2, x0, #1, vaddsign1 ; CHECK-NEXT: vmov x0, bmll0 ; CHECK-NEXT: vmov bmll0, x2 ; CHECK-NEXT: vextract.64 r5:r4, x0, #2, vaddsign1 -; CHECK-NEXT: mova dj0, #30 -; CHECK-NEXT: st.s16 r23, [p2, dj0] ; CHECK-NEXT: vmov x0, bmll0 -; CHECK-NEXT: mov p7, r31 +; CHECK-NEXT: mova dj0, #30 +; CHECK-NEXT: st.s16 r23, [p4, dj0] +; CHECK-NEXT: mov p0, r25 ; CHECK-NEXT: vextract.64 r7:r6, x0, #3, vaddsign1 ; CHECK-NEXT: vmov x0, bmll1 -; CHECK-NEXT: mov r31, p7 +; CHECK-NEXT: mov p1, p0 +; CHECK-NEXT: mov r15, p0 ; CHECK-NEXT: mova dj0, #36 -; CHECK-NEXT: st r0, [p7, #0] -; CHECK-NEXT: st r1, [p7, #4] -; CHECK-NEXT: st r2, [p7, #8] -; 
CHECK-NEXT: st r3, [p7, #12] -; CHECK-NEXT: st r4, [p7, #16] -; CHECK-NEXT: st r5, [p7, #20] -; CHECK-NEXT: vextract.64 r5:r4, x0, #0, vaddsign1 -; CHECK-NEXT: vmov x0, bmll1 -; CHECK-NEXT: st r6, [p7, #24] -; CHECK-NEXT: st r7, [p7, #28] -; CHECK-NEXT: mov p7, r8 -; CHECK-NEXT: vextract.64 r7:r6, x0, #1, vaddsign1 +; CHECK-NEXT: st r1, [p0, #4] +; CHECK-NEXT: st r3, [p0, #12] +; CHECK-NEXT: st r4, [p0, #16] +; CHECK-NEXT: st r5, [p0, #20] +; CHECK-NEXT: st r6, [p0, #24] +; CHECK-NEXT: st r7, [p0, #28] +; CHECK-NEXT: mov p0, r24 +; CHECK-NEXT: st r0, [p1], #8 +; CHECK-NEXT: vextract.64 r7:r6, x0, #0, vaddsign1 ; CHECK-NEXT: vmov x0, bmll1 -; CHECK-NEXT: mov r8, p7 -; CHECK-NEXT: vextract.64 r17:r16, x0, #2, vaddsign1 +; CHECK-NEXT: mov r14, p1 +; CHECK-NEXT: st r2, [p1, #0] +; CHECK-NEXT: mov p1, p0 +; CHECK-NEXT: vextract.64 r17:r16, x0, #1, vaddsign1 ; CHECK-NEXT: vmov x0, bmll1 -; CHECK-NEXT: st r4, [p7, #0] -; CHECK-NEXT: st r5, [p7, #4] -; CHECK-NEXT: vextract.32 r4, x10, #4, vaddsign1 -; CHECK-NEXT: vextract.32 r5, x10, #5, vaddsign1 -; CHECK-NEXT: vextract.64 r19:r18, x0, #3, vaddsign1 +; CHECK-NEXT: vextract.64 r19:r18, x0, #2, vaddsign1 ; CHECK-NEXT: vmov x0, bmll1 -; CHECK-NEXT: st r6, [p7, #8] -; CHECK-NEXT: st r7, [p7, #12] +; CHECK-NEXT: st r6, [p1], #8 ; CHECK-NEXT: vextract.32 r6, x10, #6, vaddsign1 ; CHECK-NEXT: vextract.32 r7, x10, #7, vaddsign1 -; CHECK-NEXT: vextract.64 r21:r20, x0, #4, vaddsign1 +; CHECK-NEXT: st r7, [p0, #4] +; CHECK-NEXT: mov r24, p1 +; CHECK-NEXT: st r16, [p1, #0] +; CHECK-NEXT: mov p1, p0 +; CHECK-NEXT: vextract.64 r21:r20, x0, #3, vaddsign1 ; CHECK-NEXT: vmov x0, bmll1 -; CHECK-NEXT: st r16, [p7, #16] -; CHECK-NEXT: st r17, [p7, #20] +; CHECK-NEXT: st r17, [p0, #12] ; CHECK-NEXT: vextract.32 r16, x10, #8, vaddsign1 ; CHECK-NEXT: vextract.32 r17, x10, #9, vaddsign1 -; CHECK-NEXT: st r4, [p5, #16] -; CHECK-NEXT: st r5, [p5, #20] -; CHECK-NEXT: vextract.64 r23:r22, x0, #5, vaddsign1 +; CHECK-NEXT: vextract.64 r23:r22, x0, #4, 
vaddsign1 ; CHECK-NEXT: vmov x0, bmll1 -; CHECK-NEXT: st r18, [p7, #24] -; CHECK-NEXT: st r19, [p7, #28] +; CHECK-NEXT: st r18, [p0, #16] +; CHECK-NEXT: st r19, [p0, #20] ; CHECK-NEXT: vextract.32 r18, x10, #10, vaddsign1 ; CHECK-NEXT: vextract.32 r19, x10, #11, vaddsign1 -; CHECK-NEXT: st r6, [p5, #24] -; CHECK-NEXT: st r7, [p5, #28] -; CHECK-NEXT: st r21, [p7, dj0] -; CHECK-NEXT: mova dj0, #56 -; CHECK-NEXT: st r17, [p5, dj5] +; CHECK-NEXT: st r6, [p7, #24] +; CHECK-NEXT: st r7, [p7, #28] +; CHECK-NEXT: st r17, [p7, dj5] ; CHECK-NEXT: mova dj5, #8 -; CHECK-NEXT: vextract.64 r3:r2, x0, #6, vaddsign1 +; CHECK-NEXT: vextract.64 r5:r4, x0, #5, vaddsign1 ; CHECK-NEXT: vmov x0, bmll1 -; CHECK-NEXT: st r20, [p7, dj4] +; CHECK-NEXT: st r20, [p0, #24] +; CHECK-NEXT: st r21, [p0, #28] ; CHECK-NEXT: vextract.32 r20, x10, #12, vaddsign1 ; CHECK-NEXT: vextract.32 r21, x10, #13, vaddsign1 -; CHECK-NEXT: st r16, [p5, dj4] -; CHECK-NEXT: lda.s8 r16, [p0, dj5] +; CHECK-NEXT: st r16, [p7, dj4] +; CHECK-NEXT: st r23, [p0, dj0] +; CHECK-NEXT: mova dj0, #56 +; CHECK-NEXT: lda.s8 r16, [p2, dj5] ; CHECK-NEXT: mova dj5, #9 -; CHECK-NEXT: vextract.64 r1:r0, x0, #7, vaddsign1 -; CHECK-NEXT: st r22, [p7, dj6] -; CHECK-NEXT: st r23, [p7, dj7] +; CHECK-NEXT: vextract.64 r3:r2, x0, #6, vaddsign1 +; CHECK-NEXT: vmov x0, bmll1 +; CHECK-NEXT: st r22, [p0, dj4] ; CHECK-NEXT: vextract.32 r22, x10, #14, vaddsign1 ; CHECK-NEXT: vextract.32 r23, x10, #15, vaddsign1 -; CHECK-NEXT: st r18, [p5, dj6] -; CHECK-NEXT: st r19, [p5, dj7] -; CHECK-NEXT: lda.s8 r17, [p0, dj5] +; CHECK-NEXT: st r18, [p7, dj6] +; CHECK-NEXT: st r19, [p7, dj7] +; CHECK-NEXT: lda.s8 r17, [p2, dj5] ; CHECK-NEXT: mova dj5, #10 -; CHECK-NEXT: st r2, [p7, dj1] -; CHECK-NEXT: st r3, [p7, dj3] +; CHECK-NEXT: vextract.64 r1:r0, x0, #7, vaddsign1 +; CHECK-NEXT: st r4, [p0, dj6] +; CHECK-NEXT: st r5, [p0, dj7] +; CHECK-NEXT: vextract.32 r4, x10, #4, vaddsign1 +; CHECK-NEXT: vextract.32 r5, x10, #5, vaddsign1 +; CHECK-NEXT: st r20, [p7, 
dj1] +; CHECK-NEXT: st r21, [p7, dj3] +; CHECK-NEXT: lda.s8 r18, [p2, dj5] +; CHECK-NEXT: mova dj5, #11 +; CHECK-NEXT: st r2, [p0, dj1] +; CHECK-NEXT: st r3, [p0, dj3] ; CHECK-NEXT: vextract.32 r2, x10, #2, vaddsign1 ; CHECK-NEXT: vextract.32 r3, x10, #3, vaddsign1 -; CHECK-NEXT: st r20, [p5, dj1] -; CHECK-NEXT: st r21, [p5, dj3] -; CHECK-NEXT: st r0, [p7, dj0] -; CHECK-NEXT: st r1, [p7, dj2] -; CHECK-NEXT: mov p7, r27 -; CHECK-NEXT: lda.s8 r18, [p0, dj5] -; CHECK-NEXT: mova dj5, #11 +; CHECK-NEXT: st r22, [p7, dj0] +; CHECK-NEXT: st r23, [p7, dj2] +; CHECK-NEXT: st r0, [p0, dj0] +; CHECK-NEXT: st r1, [p0, dj2] +; CHECK-NEXT: mov p0, p7 +; CHECK-NEXT: lda.s8 r19, [p2, dj5] +; CHECK-NEXT: mova dj5, #12 ; CHECK-NEXT: vextract.32 r0, x10, #0, vaddsign1 ; CHECK-NEXT: vextract.32 r1, x10, #1, vaddsign1 -; CHECK-NEXT: st r22, [p5, dj0] -; CHECK-NEXT: st r23, [p5, dj2] -; CHECK-NEXT: lda.s8 r19, [p0, dj5] -; CHECK-NEXT: mova dj5, #12 -; CHECK-NEXT: st r2, [p5, #8] -; CHECK-NEXT: st r3, [p5, #12] -; CHECK-NEXT: lda.s8 r20, [p0, dj5] +; CHECK-NEXT: st r4, [p7, #16] +; CHECK-NEXT: st r5, [p7, #20] +; CHECK-NEXT: st r2, [p7, #8] +; CHECK-NEXT: st r3, [p7, #12] +; CHECK-NEXT: mov p7, r31 +; CHECK-NEXT: lda.s8 r20, [p2, dj5] ; CHECK-NEXT: mova dj5, #13 -; CHECK-NEXT: st r0, [p5, #0] -; CHECK-NEXT: st r1, [p5, #4] -; CHECK-NEXT: lda.s16 r0, [p7], #2 -; CHECK-NEXT: lda.s16 r1, [p7, #0] -; CHECK-NEXT: mov p7, r27 -; CHECK-NEXT: lda.s8 r21, [p0, dj5] +; CHECK-NEXT: lda.s8 r21, [p2, dj5] ; CHECK-NEXT: mova dj5, #14 -; CHECK-NEXT: lda.s16 r2, [p7, #4] -; CHECK-NEXT: lda.s16 r3, [p7, #6] -; CHECK-NEXT: lda.s16 r4, [p7, #8] -; CHECK-NEXT: lda.s16 r5, [p7, #10] -; CHECK-NEXT: lda.s16 r6, [p7, #12] -; CHECK-NEXT: lda.s16 r7, [p7, #14] -; CHECK-NEXT: mov p7, r28 -; CHECK-NEXT: lda.s8 r22, [p0, dj5] +; CHECK-NEXT: st r0, [p0], #4 +; CHECK-NEXT: mov r25, p0 +; CHECK-NEXT: st r1, [p0, #0] +; CHECK-NEXT: mov p0, r28 +; CHECK-NEXT: lda.s16 r1, [p7, #0] +; CHECK-NEXT: mov p7, r9 +; CHECK-NEXT: 
lda.s8 r22, [p2, dj5] ; CHECK-NEXT: mova dj5, #15 -; CHECK-NEXT: lda.s8 r23, [p0, dj5] -; CHECK-NEXT: st.s16 r0, [p7], #2 +; CHECK-NEXT: lda r9, [sp, #-608] // 4-byte Folded Reload +; CHECK-NEXT: lda.s16 r0, [p0, #0] +; CHECK-NEXT: lda.s16 r2, [p0, #4] +; CHECK-NEXT: lda.s16 r3, [p0, #6] +; CHECK-NEXT: lda.s16 r4, [p0, #8] +; CHECK-NEXT: lda.s16 r5, [p0, #10] +; CHECK-NEXT: lda.s16 r6, [p0, #12] +; CHECK-NEXT: lda.s16 r7, [p0, #14] +; CHECK-NEXT: mov p0, r27 +; CHECK-NEXT: lda.s8 r23, [p2, dj5] +; CHECK-NEXT: st.s16 r0, [p0], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st.s16 r1, [p7], #2 +; CHECK-NEXT: lda r0, [p7, #0] +; CHECK-NEXT: st.s16 r1, [p0], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st.s16 r2, [p7], #2 +; CHECK-NEXT: st.s16 r2, [p0], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st.s16 r3, [p7], #2 +; CHECK-NEXT: lda r2, [p7, #8] +; CHECK-NEXT: st.s16 r3, [p0], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st.s16 r4, [p7], #2 +; CHECK-NEXT: lda r3, [p7, #12] +; CHECK-NEXT: lda p7, [sp, #-640] // 4-byte Folded Reload +; CHECK-NEXT: st.s16 r4, [p0], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: lda.s8 r4, [p0, #4] -; CHECK-NEXT: st.s16 r5, [p7], #2 +; CHECK-NEXT: lda.s8 r4, [p2, #4] +; CHECK-NEXT: st.s16 r5, [p0], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: lda.s8 r5, [p0, #5] -; CHECK-NEXT: st.s16 r6, [p7], #2 +; CHECK-NEXT: lda.s8 r5, [p2, #5] +; CHECK-NEXT: st.s16 r6, [p0], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; 
CHECK-NEXT: mova m0, #-272 -; CHECK-NEXT: st.s16 r7, [p7, #0] -; CHECK-NEXT: mov p1, sp -; CHECK-NEXT: padda [p1], m0 -; CHECK-NEXT: mov r26, p1 -; CHECK-NEXT: mov p1, sp -; CHECK-NEXT: padda [p1], #-256 -; CHECK-NEXT: mov p7, r29 -; CHECK-NEXT: lda r0, [p7], #4 -; CHECK-NEXT: lda r1, [p7, #0] -; CHECK-NEXT: mov p7, r29 -; CHECK-NEXT: lda r2, [p7, #8] -; CHECK-NEXT: lda r3, [p7, #12] -; CHECK-NEXT: lda.s8 r6, [p0, #6] -; CHECK-NEXT: lda.s8 r7, [p0, #7] -; CHECK-NEXT: mov p7, r26 -; CHECK-NEXT: st r0, [p7], #4 -; CHECK-NEXT: st r1, [p7], #4 -; CHECK-NEXT: st r2, [p7], #4 -; CHECK-NEXT: st r3, [p7, #0] -; CHECK-NEXT: mov p7, p0 -; CHECK-NEXT: lda.s8 r0, [p7], #1 -; CHECK-NEXT: st.s8 r0, [p1], #1 ; CHECK-NEXT: nop +; CHECK-NEXT: lda.s8 r6, [p2, #6] +; CHECK-NEXT: st.s16 r7, [p0, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: lda.s8 r2, [p0, #2] -; CHECK-NEXT: lda.s8 r3, [p0, #3] -; CHECK-NEXT: mov p0, p2 -; CHECK-NEXT: lda.s8 r1, [p7, #0] -; CHECK-NEXT: lda.s16 r0, [p0], #2 -; CHECK-NEXT: lda p7, [sp, #-576] // 4-byte Folded Reload -; CHECK-NEXT: st.s8 r1, [p1], #1 +; CHECK-NEXT: mov p0, r30 +; CHECK-NEXT: lda.s8 r7, [p2, #7] +; CHECK-NEXT: lda r1, [p0, #0] +; CHECK-NEXT: mov p0, r26 +; CHECK-NEXT: st r0, [p0], #4 +; CHECK-NEXT: lda.s8 r0, [p2, #0] +; CHECK-NEXT: st.s8 r0, [p3], #1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: lda.s16 r1, [p0, #0] -; CHECK-NEXT: st.s8 r2, [p1], #1 +; CHECK-NEXT: lda.s16 r0, [p4, #0] +; CHECK-NEXT: st r1, [p0], #4 +; CHECK-NEXT: st r2, [p0], #4 +; CHECK-NEXT: lda.s8 r2, [p2, #2] +; CHECK-NEXT: st r3, [p0, #0] +; CHECK-NEXT: lda.s8 r3, [p2, #3] +; CHECK-NEXT: mov p0, r12 +; CHECK-NEXT: lda r12, [sp, #-620] // 4-byte Folded Reload +; CHECK-NEXT: lda r11, [sp, #-616] // 4-byte Folded Reload +; CHECK-NEXT: lda.s8 r1, [p0, #0] +; CHECK-NEXT: lda r13, [sp, #-624] // 4-byte Folded Reload +; CHECK-NEXT: 
st.s8 r1, [p3], #1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: mov p2, r11 ; CHECK-NEXT: nop +; CHECK-NEXT: mov p0, r13 +; CHECK-NEXT: nop +; CHECK-NEXT: lda.s16 r1, [p0, #0] +; CHECK-NEXT: lda r10, [sp, #-612] // 4-byte Folded Reload +; CHECK-NEXT: st.s8 r2, [p3], #1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: lda.s16 r2, [p2, #4] -; CHECK-NEXT: st.s8 r3, [p1], #1 ; CHECK-NEXT: nop +; CHECK-NEXT: mov p0, r10 ; CHECK-NEXT: nop +; CHECK-NEXT: lda.s16 r2, [p4, #4] +; CHECK-NEXT: st.s8 r3, [p3], #1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: lda.s16 r3, [p2, #6] -; CHECK-NEXT: st.s8 r4, [p1], #1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: lda.s16 r3, [p4, #6] +; CHECK-NEXT: st.s8 r4, [p3], #1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: lda.s16 r4, [p2, #8] -; CHECK-NEXT: st.s8 r5, [p1], #1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: lda.s16 r4, [p4, #8] +; CHECK-NEXT: st.s8 r5, [p3], #1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: lda.s16 r5, [p2, #10] -; CHECK-NEXT: st.s8 r6, [p1], #1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: lda.s16 r5, [p4, #10] +; CHECK-NEXT: st.s8 r6, [p3], #1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: lda.s16 r6, [p2, #12] -; CHECK-NEXT: st.s8 r7, [p1], #1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: lda.s16 r6, [p4, #12] +; CHECK-NEXT: st.s8 r7, [p3], #1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: lda.s16 r7, [p2, #14] -; CHECK-NEXT: st.s8 r16, [p1], #1 +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: lda.s16 r7, [p4, #14] +; CHECK-NEXT: st.s8 r16, [p3], #1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: mova dj5, #16 -; CHECK-NEXT: lda.s16 r16, [p2, dj5] -; CHECK-NEXT: st.s8 r17, [p1], #1 +; CHECK-NEXT: lda.s16 
r16, [p4, dj5] +; CHECK-NEXT: st.s8 r17, [p3], #1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: mova dj5, #18 -; CHECK-NEXT: lda.s16 r17, [p2, dj5] -; CHECK-NEXT: st.s8 r18, [p1], #1 +; CHECK-NEXT: lda.s16 r17, [p4, dj5] +; CHECK-NEXT: st.s8 r18, [p3], #1 +; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: mova dj5, #20 +; CHECK-NEXT: lda.s16 r18, [p4, dj5] +; CHECK-NEXT: st.s8 r19, [p3], #1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st.s8 r19, [p1], #1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: mova dj5, #22 +; CHECK-NEXT: lda.s16 r19, [p4, dj5] +; CHECK-NEXT: st.s8 r20, [p3], #1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st.s8 r20, [p1], #1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: mova dj5, #24 +; CHECK-NEXT: lda.s16 r20, [p4, dj5] +; CHECK-NEXT: st.s8 r21, [p3], #1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st.s8 r21, [p1], #1 ; CHECK-NEXT: nop +; CHECK-NEXT: mova dj5, #26 +; CHECK-NEXT: lda.s16 r21, [p4, dj5] +; CHECK-NEXT: st.s8 r22, [p3], #1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st.s8 r22, [p1], #1 +; CHECK-NEXT: mova dj5, #28 +; CHECK-NEXT: lda.s16 r22, [p4, dj5] +; CHECK-NEXT: st.s8 r23, [p3, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: mova dj5, #30 +; CHECK-NEXT: lda.s16 r23, [p4, dj5] +; CHECK-NEXT: lda r15, [sp, #-632] // 4-byte Folded Reload +; CHECK-NEXT: st.s16 r0, [p5], #2 ; CHECK-NEXT: nop -; CHECK-NEXT: st.s8 r23, [p1, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: mov p3, sp -; CHECK-NEXT: mova m0, #-240 -; CHECK-NEXT: padda [p3], m0 -; CHECK-NEXT: st.s16 r0, [p3], #2 +; CHECK-NEXT: mov p3, r15 ; CHECK-NEXT: nop +; CHECK-NEXT: lda r0, [p0, #0] +; CHECK-NEXT: st.s16 r1, 
[p5], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st.s16 r1, [p3], #2 ; CHECK-NEXT: nop +; CHECK-NEXT: lda r1, [p2, #0] +; CHECK-NEXT: lda r14, [sp, #-628] // 4-byte Folded Reload +; CHECK-NEXT: st.s16 r2, [p5], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: mova dj5, #20 -; CHECK-NEXT: lda.s16 r18, [p2, dj5] -; CHECK-NEXT: mova dj5, #22 -; CHECK-NEXT: lda.s16 r19, [p2, dj5] -; CHECK-NEXT: mova dj5, #24 -; CHECK-NEXT: lda.s16 r20, [p2, dj5] -; CHECK-NEXT: mova dj5, #26 -; CHECK-NEXT: lda.s16 r21, [p2, dj5] -; CHECK-NEXT: mova dj5, #28 -; CHECK-NEXT: lda.s16 r22, [p2, dj5] -; CHECK-NEXT: mova dj5, #30 -; CHECK-NEXT: mov p1, r30 -; CHECK-NEXT: lda.s16 r23, [p2, dj5] -; CHECK-NEXT: mov p0, p1 -; CHECK-NEXT: lda r0, [p0], #4 -; CHECK-NEXT: lda r1, [p0, #0] -; CHECK-NEXT: st.s16 r2, [p3], #2 +; CHECK-NEXT: mov p2, r14 ; CHECK-NEXT: nop +; CHECK-NEXT: lda r2, [p0, #8] +; CHECK-NEXT: st.s16 r3, [p5], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: lda r2, [p1, #8] -; CHECK-NEXT: st.s16 r3, [p3], #2 ; CHECK-NEXT: nop +; CHECK-NEXT: lda r3, [p0, #12] +; CHECK-NEXT: st.s16 r4, [p5], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: lda r3, [p1, #12] -; CHECK-NEXT: st.s16 r4, [p3], #2 ; CHECK-NEXT: nop +; CHECK-NEXT: lda r4, [p0, #16] +; CHECK-NEXT: st.s16 r5, [p5], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: lda r4, [p1, #16] -; CHECK-NEXT: st.s16 r5, [p3], #2 ; CHECK-NEXT: nop +; CHECK-NEXT: lda r5, [p0, #20] +; CHECK-NEXT: st.s16 r6, [p5], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: lda r5, [p1, #20] -; CHECK-NEXT: st.s16 r6, [p3], #2 ; CHECK-NEXT: nop +; CHECK-NEXT: lda r6, [p0, #24] +; CHECK-NEXT: st.s16 r7, [p5], #2 ; 
CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: lda r6, [p1, #24] -; CHECK-NEXT: st.s16 r7, [p3], #2 ; CHECK-NEXT: nop -; CHECK-NEXT: mov p4, sp -; CHECK-NEXT: mova m0, #-208 -; CHECK-NEXT: padda [p4], m0 -; CHECK-NEXT: mov r25, p4 -; CHECK-NEXT: mov p0, r25 -; CHECK-NEXT: lda r7, [p1, #28] +; CHECK-NEXT: lda r7, [p0, #28] +; CHECK-NEXT: mov p0, r29 ; CHECK-NEXT: st r0, [p0], #4 -; CHECK-NEXT: st.s16 r16, [p3], #2 +; CHECK-NEXT: st.s16 r16, [p5], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: mov p1, r31 +; CHECK-NEXT: nop +; CHECK-NEXT: lda r16, [p1, dj4] ; CHECK-NEXT: st r1, [p0], #4 -; CHECK-NEXT: lda r1, [p1, #4] -; CHECK-NEXT: st.s16 r17, [p3], #2 +; CHECK-NEXT: lda r1, [p3, #4] +; CHECK-NEXT: st.s16 r17, [p5], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -710,16 +777,17 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: st r2, [p0], #4 -; CHECK-NEXT: lda r2, [p1, #8] -; CHECK-NEXT: st.s16 r18, [p3], #2 +; CHECK-NEXT: lda r2, [p2, #0] +; CHECK-NEXT: st.s16 r18, [p5], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: lda r18, [p1, dj6] ; CHECK-NEXT: st r3, [p0], #4 -; CHECK-NEXT: st.s16 r19, [p3], #2 +; CHECK-NEXT: st.s16 r19, [p5], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -727,8 +795,8 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: st r4, [p0], #4 -; CHECK-NEXT: lda r4, [p1, #16] -; CHECK-NEXT: st.s16 r20, [p3], #2 +; CHECK-NEXT: lda r4, [p3, #16] +; CHECK-NEXT: st.s16 r20, [p5], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -736,7 +804,7 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; 
CHECK-NEXT: st r5, [p0], #4 -; CHECK-NEXT: st.s16 r21, [p3], #2 +; CHECK-NEXT: st.s16 r21, [p5], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -744,37 +812,34 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: st r6, [p0], #4 -; CHECK-NEXT: lda r6, [p1, #24] -; CHECK-NEXT: st.s16 r22, [p3], #2 -; CHECK-NEXT: mov p6, sp -; CHECK-NEXT: mova m0, #-176 -; CHECK-NEXT: mov p4, sp -; CHECK-NEXT: padda [p4], m0 -; CHECK-NEXT: mova m0, #-144 -; CHECK-NEXT: mov r24, p4 +; CHECK-NEXT: st.s16 r22, [p5], #2 +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop ; CHECK-NEXT: st r7, [p0, #0] -; CHECK-NEXT: lda r8, [sp, #-568] // 4-byte Folded Reload -; CHECK-NEXT: st.s16 r23, [p3, #0] -; CHECK-NEXT: mov p4, sp -; CHECK-NEXT: padda [p4], m0 -; CHECK-NEXT: mova m0, #-80 -; CHECK-NEXT: mov p0, p1 -; CHECK-NEXT: mov p1, r8 -; CHECK-NEXT: padda [p6], m0 -; CHECK-NEXT: lda r16, [p1, dj4] -; CHECK-NEXT: lda r18, [p1, dj6] +; CHECK-NEXT: st.s16 r23, [p5, #0] +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: mov p2, r24 +; CHECK-NEXT: mov p0, p3 ; CHECK-NEXT: lda r0, [p0], #12 +; CHECK-NEXT: lda p0, [sp, #-44] // 4-byte Folded Reload ; CHECK-NEXT: lda r20, [p1, dj1] +; CHECK-NEXT: lda r6, [p3, #24] ; CHECK-NEXT: lda r22, [p1, dj0] ; CHECK-NEXT: lda r3, [p0], #8 ; CHECK-NEXT: lda r5, [p0], #8 ; CHECK-NEXT: lda r7, [p0, #0] -; CHECK-NEXT: mov p0, r24 ; CHECK-NEXT: st r0, [p0], #4 ; CHECK-NEXT: st r1, [p0], #4 ; CHECK-NEXT: lda r1, [p1, #4] ; CHECK-NEXT: st r2, [p0], #4 -; CHECK-NEXT: lda r2, [p1, #8] +; CHECK-NEXT: lda r2, [p2, #0] ; CHECK-NEXT: st r3, [p0], #4 ; CHECK-NEXT: st r4, [p0], #4 ; CHECK-NEXT: lda r4, [p1, #16] @@ -783,6 +848,7 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: lda r6, [p1, #24] ; CHECK-NEXT: st r7, [p0, #0] ; CHECK-NEXT: 
mov p0, p1 +; CHECK-NEXT: mov p1, r25 ; CHECK-NEXT: lda r0, [p0], #12 ; CHECK-NEXT: lda r3, [p0], #8 ; CHECK-NEXT: lda r5, [p0], #8 @@ -791,59 +857,61 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: lda r19, [p0], #8 ; CHECK-NEXT: lda r21, [p0], #8 ; CHECK-NEXT: lda r23, [p0, #0] -; CHECK-NEXT: mov p0, p5 -; CHECK-NEXT: st r0, [p4], #4 -; CHECK-NEXT: st r1, [p4], #4 -; CHECK-NEXT: lda r0, [p0], #4 -; CHECK-NEXT: st r2, [p4], #4 -; CHECK-NEXT: lda r1, [p0, #0] -; CHECK-NEXT: lda r2, [p5, #8] -; CHECK-NEXT: st r3, [p4], #4 -; CHECK-NEXT: lda r3, [p5, #12] -; CHECK-NEXT: st r4, [p4], #4 -; CHECK-NEXT: lda r4, [p5, #16] -; CHECK-NEXT: st r5, [p4], #4 -; CHECK-NEXT: lda r5, [p5, #20] -; CHECK-NEXT: st r6, [p4], #4 -; CHECK-NEXT: lda r6, [p5, #24] -; CHECK-NEXT: st r7, [p4], #4 -; CHECK-NEXT: lda r7, [p5, #28] -; CHECK-NEXT: st r16, [p4], #4 -; CHECK-NEXT: lda r16, [p5, dj4] -; CHECK-NEXT: mova dj4, #36 +; CHECK-NEXT: mov p0, r8 ; CHECK-NEXT: st r0, [p6], #4 -; CHECK-NEXT: st r17, [p4], #4 +; CHECK-NEXT: lda r8, [sp, #-604] // 4-byte Folded Reload ; CHECK-NEXT: st r1, [p6], #4 -; CHECK-NEXT: lda r17, [p5, dj4] -; CHECK-NEXT: st r18, [p4], #4 +; CHECK-NEXT: lda r0, [p0, #0] +; CHECK-NEXT: lda r1, [p1, #0] ; CHECK-NEXT: st r2, [p6], #4 -; CHECK-NEXT: lda r18, [p5, dj6] -; CHECK-NEXT: st r19, [p4], #4 +; CHECK-NEXT: lda r2, [p0, #8] ; CHECK-NEXT: st r3, [p6], #4 -; CHECK-NEXT: lda r19, [p5, dj7] -; CHECK-NEXT: st r20, [p4], #4 +; CHECK-NEXT: lda r3, [p0, #12] ; CHECK-NEXT: st r4, [p6], #4 -; CHECK-NEXT: lda r20, [p5, dj1] -; CHECK-NEXT: st r21, [p4], #4 +; CHECK-NEXT: lda r4, [p0, #16] ; CHECK-NEXT: st r5, [p6], #4 -; CHECK-NEXT: lda r21, [p5, dj3] -; CHECK-NEXT: st r22, [p4], #4 +; CHECK-NEXT: lda r5, [p0, #20] ; CHECK-NEXT: st r6, [p6], #4 -; CHECK-NEXT: lda r22, [p5, dj0] -; CHECK-NEXT: st r23, [p4, #0] +; CHECK-NEXT: lda r6, [p0, #24] ; CHECK-NEXT: st r7, [p6], #4 -; CHECK-NEXT: lda r23, [p5, dj2] +; CHECK-NEXT: lda r7, 
[p0, #28] ; CHECK-NEXT: st r16, [p6], #4 +; CHECK-NEXT: lda r16, [p0, dj4] +; CHECK-NEXT: mova dj4, #36 ; CHECK-NEXT: st r17, [p6], #4 +; CHECK-NEXT: lda r17, [p0, dj4] ; CHECK-NEXT: st r18, [p6], #4 +; CHECK-NEXT: lda r18, [p0, dj6] ; CHECK-NEXT: st r19, [p6], #4 +; CHECK-NEXT: lda r19, [p0, dj7] ; CHECK-NEXT: st r20, [p6], #4 -; CHECK-NEXT: lda p6, [sp, #-572] // 4-byte Folded Reload +; CHECK-NEXT: lda r20, [p0, dj1] +; CHECK-NEXT: st r21, [p6], #4 +; CHECK-NEXT: lda r21, [p0, dj3] +; CHECK-NEXT: lda p0, [sp, #-48] // 4-byte Folded Reload +; CHECK-NEXT: st r22, [p6], #4 +; CHECK-NEXT: lda r22, [p0, dj0] +; CHECK-NEXT: st r23, [p6, #0] +; CHECK-NEXT: lda r23, [p0, dj2] +; CHECK-NEXT: lda p6, [sp, #-636] // 4-byte Folded Reload +; CHECK-NEXT: paddxm [sp], #-640 +; CHECK-NEXT: st r0, [p0], #4 +; CHECK-NEXT: st r1, [p0], #4 +; CHECK-NEXT: st r2, [p0], #4 +; CHECK-NEXT: st r3, [p0], #4 +; CHECK-NEXT: st r4, [p0], #4 +; CHECK-NEXT: st r5, [p0], #4 +; CHECK-NEXT: st r6, [p0], #4 +; CHECK-NEXT: st r7, [p0], #4 +; CHECK-NEXT: st r16, [p0], #4 +; CHECK-NEXT: st r17, [p0], #4 +; CHECK-NEXT: st r18, [p0], #4 +; CHECK-NEXT: st r19, [p0], #4 ; CHECK-NEXT: ret lr -; CHECK-NEXT: st r21, [p6], #4 // Delay Slot 5 -; CHECK-NEXT: st r22, [p6], #4 // Delay Slot 4 -; CHECK-NEXT: st r23, [p6, #0] // Delay Slot 3 -; CHECK-NEXT: paddxm [sp], #-576 // Delay Slot 2 +; CHECK-NEXT: st r20, [p0], #4 // Delay Slot 5 +; CHECK-NEXT: st r21, [p0], #4 // Delay Slot 4 +; CHECK-NEXT: st r22, [p0], #4 // Delay Slot 3 +; CHECK-NEXT: st r23, [p0, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: %a.addr = alloca <8 x i16>, align 8 From 2dc1a42c5b404f2628b946eedd3f2de61cf0249e Mon Sep 17 00:00:00 2001 From: Fabian Stuckmann Date: Wed, 21 May 2025 01:58:59 -0600 Subject: [PATCH 5/6] [AIEX] added full search for global combiners --- llvm/lib/Target/AIE/AIEGlobalCombiner.cpp | 88 +++ .../AIE/GlobalISel/legalize-dyn-stackalloc.ll | 37 +- .../GlobalISel/postinc-with-clustering.mir | 42 +- 
.../GlobalISel/prologepilog-tail-call-opt.ll | 4 +- .../CodeGen/AIE/aie2/conv2d_offset_test.ll | 34 +- .../CodeGen/AIE/aie2/end-to-end/Add2D-red.ll | 115 ++-- .../CodeGen/AIE/aie2/end-to-end/Memops.ll | 63 +- llvm/test/CodeGen/AIE/aie2/movxm_test.ll | 4 +- llvm/test/CodeGen/AIE/aie2/vabs_gtz.ll | 12 +- llvm/test/CodeGen/AIE/aie2/vbneg_ltz.ll | 4 +- llvm/test/CodeGen/AIE/aie2/vmax_lt.ll | 16 +- llvm/test/CodeGen/AIE/aie2/vmaxdiff_lt.ll | 12 +- llvm/test/CodeGen/AIE/aie2/vmin_ge.ll | 16 +- llvm/test/CodeGen/AIE/aie2/vst_srs.ll | 4 +- llvm/test/CodeGen/AIE/aie2/vsub-cmp.ll | 24 +- .../global-combiners/MBB-crossing.mir | 8 +- .../global-combiners/post-inc-eagerness.mir | 14 +- .../global-combiners/reorder-Mem-Instrs.mir | 7 +- .../global-combiners/user-intrinsics.mir | 12 +- .../GlobalIsel/prologepilog-tail-call-opt.ll | 4 +- llvm/test/CodeGen/AIE/aie2p/Memops.ll | 68 +- .../aie2p/end-to-end/conv2d_bfp16_convert.ll | 14 +- llvm/test/CodeGen/AIE/aie2p/fifo-loads.ll | 226 +++---- .../CodeGen/AIE/aie2p/ldst-fifo-stores.ll | 160 +++-- .../CodeGen/AIE/aie2p/load-store-unaligned.ll | 599 +++++++++--------- llvm/test/CodeGen/AIE/dyn-stackalloc.ll | 37 +- 26 files changed, 835 insertions(+), 789 deletions(-) diff --git a/llvm/lib/Target/AIE/AIEGlobalCombiner.cpp b/llvm/lib/Target/AIE/AIEGlobalCombiner.cpp index 59448d9f5862..5d8cec05d34f 100644 --- a/llvm/lib/Target/AIE/AIEGlobalCombiner.cpp +++ b/llvm/lib/Target/AIE/AIEGlobalCombiner.cpp @@ -211,6 +211,94 @@ std::vector CombineCandidates::searchCombinerSet( // seed greedy solution auto BestSolution = getGreedySolution(); + std::priority_queue PQ; + // Add Start Position + PQ.emplace(NumCombiner); + LLVM_DEBUG(dbgs() << MBB->getName() << " - Initial Search Start " + << BestSolution << "\n"); + + // The search algorithm finds the optimal combination of combiners to + // maximize gain. It starts with a greedy solution and iteratively + // explores the solution space using a priority queue. 
+ // + // The Index within a CombinerSolution is the position of the next combiner + // in the Combiners vector to consider. A new CombinerSolution is created and + // pushed onto the priority queue either by including the current combiner or + // by skipping it; in both cases the Index advances by 1. A combiner is only + // included when it does not conflict with the combiners already applied, so + // every solution on the queue is conflict-free. + // + // The search prunes branches that cannot surpass the current best solution + // and prioritizes candidates by their potential future gain, so the most + // promising partial solutions are explored first. Both applying and + // skipping each combiner are considered to find the best overall + // combination. A candidate is removed from the queue as soon as it is + // popped, so the same solution root is never expanded twice. Apart from + // pruning, the search is exhaustive and bounded only by + // MaxSearchIterationCount. 
+ int Iteration = 0; + while (!PQ.empty() && Iteration < MaxSearchIterationCount) { + Iteration++; + + // Get best Candidate to continue searching + const CombinerSolution Current = PQ.top(); + PQ.pop(); + LLVM_DEBUG(dbgs() << "Search " << Current + << " MaxGain = " << Current.getMaxFutureGain() << "\n"); + + // Check if Current has finished the search + if (Current.getIndex() == Combiners.size()) { + if (Current.getGain() > BestSolution.getGain()) { + LLVM_DEBUG(dbgs() << " [Search] Updated Optimal Combiner " << Current + << "\n"); + BestSolution = Current; + } + continue; + } + + const GenericCombiner *Candidate = Combiners[Current.getIndex()]; + + // Check if search can be stopped for Current + const auto PotentialGain = getMaxPotentialGain(Current, Current.getIndex()); + if (BestSolution.getGain() > PotentialGain) { + LLVM_DEBUG( + dbgs() << " [Search] Cannot Surpass BestScore, skipping exploration " + << *Candidate << " Candidate: " << Candidate->getGain() + << " Overlap= " + << Candidate->getOverlapGain(Current.getCombinersBitVector(), + Combiners) + << " MaxGain = " << PotentialGain << "\n"); + continue; + } + + if (Current.hasConflict(Candidate)) { + LLVM_DEBUG(dbgs() << " Conflict, adding Non-Conflict Variant " + << *Candidate << "Candidate: " << Candidate->getGain() + << "\n"); + } else { + PQ.emplace(Current, Candidate, PotentialGain, Current.getIndex(), + Combiners); + LLVM_DEBUG(dbgs() << " Adding to Stack: " << *Candidate + << "Candidate: " << Candidate->getGain() + << " MaxGain = " << PotentialGain << "\n"); + } + + // Add a solution where no combiner is applied, if it could Surpass + // CurrentBest + const auto NoneMaxFutureGain = + getMaxPotentialGain(Current, Current.getIndex() + 1); + if (BestSolution.getGain() < NoneMaxFutureGain) { + LLVM_DEBUG(dbgs() << " Adding to Stack: None MaxGain = " + << NoneMaxFutureGain << "\n"); + // Idx is not needed, since we do not apply any Combiner + PQ.emplace(Current, nullptr, NoneMaxFutureGain, /*Idx=*/-1, 
Combiners); + } + } // end while + + LLVM_DEBUG(dbgs() << "Search Iterations: " << Iteration << "\n"); + LLVM_DEBUG(dbgs() << "Search Result " << BestSolution.getGain() << "\n"); + + // Save best Candidate to FixedCombiners std::vector Result; BitVector CombinerBitVec = BestSolution.getCombinersBitVector(); for (int Idx = CombinerBitVec.find_first(); Idx != -1; diff --git a/llvm/test/CodeGen/AIE/GlobalISel/legalize-dyn-stackalloc.ll b/llvm/test/CodeGen/AIE/GlobalISel/legalize-dyn-stackalloc.ll index 8da3e5784bbb..595e65e4c205 100644 --- a/llvm/test/CodeGen/AIE/GlobalISel/legalize-dyn-stackalloc.ll +++ b/llvm/test/CodeGen/AIE/GlobalISel/legalize-dyn-stackalloc.ll @@ -246,36 +246,37 @@ define void @test_huge_stack(i32 noundef %n) #0 { ; AIE2-NEXT: padda [p7], m0 ; AIE2-NEXT: movxm m0, #-40048 ; AIE2-NEXT: mov p0, p7 +; AIE2-NEXT: mov p3, p7 ; AIE2-NEXT: mov p2, p7 ; AIE2-NEXT: mov p6, p7 +; AIE2-NEXT: paddb [p3], #-32 ; AIE2-NEXT: paddb [p0], m0 -; AIE2-NEXT: paddb [p6], #-32 -; AIE2-NEXT: movxm m0, #-40032 ; AIE2-NEXT: st r0, [p0, #0] ; AIE2-NEXT: lda r0, [p0, #0] -; AIE2-NEXT: paddb [p2], m0 +; AIE2-NEXT: paddb [p2], #-24 +; AIE2-NEXT: mov r16, p3 +; AIE2-NEXT: st r1, [p2, #4] ; AIE2-NEXT: mov p0, sp -; AIE2-NEXT: mov r16, p2 -; AIE2-NEXT: st p0, [p6, #0] -; AIE2-NEXT: mov p0, p7 -; AIE2-NEXT: paddb [p0], #-24 +; AIE2-NEXT: st p0, [p3, #0] +; AIE2-NEXT: mov p0, p1 ; AIE2-NEXT: lshl r2, r0, r2 -; AIE2-NEXT: st r0, [p0], #4 +; AIE2-NEXT: st r0, [p2, #0] ; AIE2-NEXT: add r2, r2, #31 -; AIE2-NEXT: st r1, [p0, #0] +; AIE2-NEXT: and r2, r2, r3 ; AIE2-NEXT: jl #extern_call -; AIE2-NEXT: mov p0, p1 // Delay Slot 5 -; AIE2-NEXT: and r2, r2, r3 // Delay Slot 4 -; AIE2-NEXT: mov m0, r2 // Delay Slot 3 -; AIE2-NEXT: paddb [p1], m0 // Delay Slot 2 +; AIE2-NEXT: mov m0, r2 // Delay Slot 5 +; AIE2-NEXT: paddb [p1], m0 // Delay Slot 4 +; AIE2-NEXT: movxm m0, #-40032 // Delay Slot 3 +; AIE2-NEXT: paddb [p6], m0 // Delay Slot 2 ; AIE2-NEXT: mov sp, p1 // Delay Slot 1 ; AIE2-NEXT: 
nopb ; nopa ; nops ; jl #extern_call; nopv ; AIE2-NEXT: nopa ; nopx // Delay Slot 5 ; AIE2-NEXT: nop // Delay Slot 4 ; AIE2-NEXT: nop // Delay Slot 3 ; AIE2-NEXT: nop // Delay Slot 2 -; AIE2-NEXT: mov p0, r16 // Delay Slot 1 -; AIE2-NEXT: lda p0, [p6, #0]; nopx +; AIE2-NEXT: mov p0, p6 // Delay Slot 1 +; AIE2-NEXT: nopb ; nopa ; nops ; nopx ; mov p0, r16; nopv +; AIE2-NEXT: lda p0, [p0, #0]; nopx ; AIE2-NEXT: nop ; AIE2-NEXT: nop ; AIE2-NEXT: nop @@ -329,18 +330,18 @@ define void @test_huge_stack(i32 noundef %n) #0 { ; AIE2P-NEXT: padda [p0], m0 ; AIE2P-NEXT: mova m0, #-32 ; AIE2P-NEXT: padda [p3], m0 +; AIE2P-NEXT: mova m0, #-24 ; AIE2P-NEXT: st r0, [p0, #0] ; AIE2P-NEXT: lda r0, [p0, #0] -; AIE2P-NEXT: mova m0, #-24 ; AIE2P-NEXT: mov p0, sp ; AIE2P-NEXT: mov r8, p3 ; AIE2P-NEXT: padda [p2], m0 ; AIE2P-NEXT: st p0, [p3, #0] ; AIE2P-NEXT: mov p0, p1 +; AIE2P-NEXT: st r1, [p2, #4] ; AIE2P-NEXT: lshl r2, r0, r2 -; AIE2P-NEXT: st r0, [p2], #4 +; AIE2P-NEXT: st r0, [p2, #0] ; AIE2P-NEXT: add r2, r2, #63 -; AIE2P-NEXT: st r1, [p2, #0] ; AIE2P-NEXT: and r2, r2, r3 ; AIE2P-NEXT: jl #extern_call ; AIE2P-NEXT: mov m0, r2 // Delay Slot 5 diff --git a/llvm/test/CodeGen/AIE/GlobalISel/postinc-with-clustering.mir b/llvm/test/CodeGen/AIE/GlobalISel/postinc-with-clustering.mir index f730b08976dd..5f29d5662cf4 100644 --- a/llvm/test/CodeGen/AIE/GlobalISel/postinc-with-clustering.mir +++ b/llvm/test/CodeGen/AIE/GlobalISel/postinc-with-clustering.mir @@ -19,15 +19,15 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p1 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 4 - ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[C]](s20) :: (load (s32)) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32)) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 8 - ; CHECK-NEXT: [[AIE_POSTINC_STORE:%[0-9]+]]:_(p0) = G_AIE_POSTINC_STORE 
[[AIE_POSTINC_LOAD]](s32), [[COPY1]], [[C1]](s20) :: (store (s32)) - ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s20) = G_CONSTANT i20 8 - ; CHECK-NEXT: [[AIE_POSTINC_LOAD2:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD3:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[AIE_POSTINC_LOAD1]], [[C2]](s20) :: (load (s32)) + ; CHECK-NEXT: [[AIE_POSTINC_STORE:%[0-9]+]]:_(p0) = G_AIE_POSTINC_STORE [[LOAD]](s32), [[COPY1]], [[C1]](s20) :: (store (s32)) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s20) = G_CONSTANT i20 12 + ; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:_(s32) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C]](s20) :: (load (s32)) + ; CHECK-NEXT: G_STORE [[AIE_OFFSET_LOAD]](s32), [[AIE_POSTINC_STORE]](p0) :: (store (s32)) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s20) = G_CONSTANT i20 8 - ; CHECK-NEXT: [[AIE_POSTINC_STORE1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_STORE [[AIE_POSTINC_LOAD2]](s32), [[AIE_POSTINC_STORE]], [[C3]](s20) :: (store (s32)) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[AIE_POSTINC_LOAD3]](p0) :: (load (s32)) - ; CHECK-NEXT: G_STORE [[LOAD]](s32), [[AIE_POSTINC_STORE1]](p0) :: (store (s32)) + ; CHECK-NEXT: [[AIE_OFFSET_LOAD1:%[0-9]+]]:_(s32) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C2]](s20) :: (load (s32)) + ; CHECK-NEXT: G_AIE_OFFSET_STORE [[AIE_OFFSET_LOAD1]](s32), [[AIE_POSTINC_STORE]](p0), [[C3]](s20) :: (store (s32)) %0:_(p0) = COPY $p0 %1:_(p0) = COPY $p1 %2:_(s20) = G_CONSTANT i20 4 @@ -59,15 +59,15 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p1 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 4 - ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[C]](s20) :: (load (s32)) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32)) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 8 - ; CHECK-NEXT: [[AIE_POSTINC_STORE:%[0-9]+]]:_(p0) = G_AIE_POSTINC_STORE [[AIE_POSTINC_LOAD]](s32), [[COPY1]], [[C1]](s20) :: (store (s32)) - ; CHECK-NEXT: 
[[C2:%[0-9]+]]:_(s20) = G_CONSTANT i20 8 - ; CHECK-NEXT: [[AIE_POSTINC_LOAD2:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD3:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[AIE_POSTINC_LOAD1]], [[C2]](s20) :: (load (s32)) + ; CHECK-NEXT: [[AIE_POSTINC_STORE:%[0-9]+]]:_(p0) = G_AIE_POSTINC_STORE [[LOAD]](s32), [[COPY1]], [[C1]](s20) :: (store (s32)) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s20) = G_CONSTANT i20 12 + ; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:_(s32) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C]](s20) :: (load (s32)) + ; CHECK-NEXT: G_STORE [[AIE_OFFSET_LOAD]](s32), [[AIE_POSTINC_STORE]](p0) :: (store (s32)) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s20) = G_CONSTANT i20 8 - ; CHECK-NEXT: [[AIE_POSTINC_STORE1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_STORE [[AIE_POSTINC_LOAD2]](s32), [[AIE_POSTINC_STORE]], [[C3]](s20) :: (store (s32)) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[AIE_POSTINC_LOAD3]](p0) :: (load (s32)) - ; CHECK-NEXT: G_STORE [[LOAD]](s32), [[AIE_POSTINC_STORE1]](p0) :: (store (s32)) + ; CHECK-NEXT: [[AIE_OFFSET_LOAD1:%[0-9]+]]:_(s32) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C2]](s20) :: (load (s32)) + ; CHECK-NEXT: G_AIE_OFFSET_STORE [[AIE_OFFSET_LOAD1]](s32), [[AIE_POSTINC_STORE]](p0), [[C3]](s20) :: (store (s32)) %0:_(p0) = COPY $p0 %1:_(p0) = COPY $p1 %2:_(s20) = G_CONSTANT i20 4 @@ -119,15 +119,15 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p1 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 4 - ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[C]](s20) :: (load (s32)) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32)) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 8 - ; CHECK-NEXT: [[AIE_POSTINC_STORE:%[0-9]+]]:_(p0) = G_AIE_POSTINC_STORE [[AIE_POSTINC_LOAD]](s32), [[COPY1]], [[C1]](s20) :: (store (s32)) - ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s20) = G_CONSTANT i20 8 - ; CHECK-NEXT: [[AIE_POSTINC_LOAD2:%[0-9]+]]:_(s32), 
[[AIE_POSTINC_LOAD3:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[AIE_POSTINC_LOAD1]], [[C2]](s20) :: (load (s32)) + ; CHECK-NEXT: [[AIE_POSTINC_STORE:%[0-9]+]]:_(p0) = G_AIE_POSTINC_STORE [[LOAD]](s32), [[COPY1]], [[C1]](s20) :: (store (s32)) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s20) = G_CONSTANT i20 12 + ; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:_(s32) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C]](s20) :: (load (s32)) + ; CHECK-NEXT: G_STORE [[AIE_OFFSET_LOAD]](s32), [[AIE_POSTINC_STORE]](p0) :: (store (s32)) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s20) = G_CONSTANT i20 8 - ; CHECK-NEXT: [[AIE_POSTINC_STORE1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_STORE [[AIE_POSTINC_LOAD2]](s32), [[AIE_POSTINC_STORE]], [[C3]](s20) :: (store (s32)) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[AIE_POSTINC_LOAD3]](p0) :: (load (s32)) - ; CHECK-NEXT: G_STORE [[LOAD]](s32), [[AIE_POSTINC_STORE1]](p0) :: (store (s32)) + ; CHECK-NEXT: [[AIE_OFFSET_LOAD1:%[0-9]+]]:_(s32) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C2]](s20) :: (load (s32)) + ; CHECK-NEXT: G_AIE_OFFSET_STORE [[AIE_OFFSET_LOAD1]](s32), [[AIE_POSTINC_STORE]](p0), [[C3]](s20) :: (store (s32)) %0:_(p0) = COPY $p0 %1:_(p0) = COPY $p1 %2:_(s20) = G_CONSTANT i20 4 diff --git a/llvm/test/CodeGen/AIE/aie2/GlobalISel/prologepilog-tail-call-opt.ll b/llvm/test/CodeGen/AIE/aie2/GlobalISel/prologepilog-tail-call-opt.ll index 55a7dfd50fc1..abbfdff474ba 100644 --- a/llvm/test/CodeGen/AIE/aie2/GlobalISel/prologepilog-tail-call-opt.ll +++ b/llvm/test/CodeGen/AIE/aie2/GlobalISel/prologepilog-tail-call-opt.ll @@ -40,8 +40,8 @@ define dso_local void @_Z5test2PPv(ptr nocapture readonly %args) local_unnamed_a ; CHECK: bb.0.entry: ; CHECK-NEXT: liveins: $p0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $p1, renamable $p0 = LDA_dms_lda_pstm_nrm_imm killed renamable $p0, 4 :: (load (p0) from %ir.args, align 4) - ; CHECK-NEXT: renamable $p0 = LDA_dms_lda_idx_imm killed renamable $p0, 0 :: (load (p0) from %ir.arrayidx1, align 4) + ; CHECK-NEXT: renamable $p1 = LDA_dms_lda_idx_imm 
renamable $p0, 0 :: (load (p0) from %ir.args, align 4) + ; CHECK-NEXT: renamable $p0 = LDA_dms_lda_idx_imm killed renamable $p0, 4 :: (load (p0) from %ir.arrayidx1, align 4) ; CHECK-NEXT: renamable $r1 = LDA_dms_lda_idx_imm killed renamable $p1, 0 :: (load (s32) from %ir.0) ; CHECK-NEXT: renamable $r2 = LDA_dms_lda_idx_imm killed renamable $p0, 0 :: (load (s32) from %ir.2) ; CHECK-NEXT: PseudoJ_TCO_jump_imm @_Z4funcii, csr_aie2, implicit $r1, implicit $r2 diff --git a/llvm/test/CodeGen/AIE/aie2/conv2d_offset_test.ll b/llvm/test/CodeGen/AIE/aie2/conv2d_offset_test.ll index 09e4fb215f2a..337753cc86c5 100644 --- a/llvm/test/CodeGen/AIE/aie2/conv2d_offset_test.ll +++ b/llvm/test/CodeGen/AIE/aie2/conv2d_offset_test.ll @@ -20,29 +20,31 @@ define dso_local noundef i32 @_Z3foov() #0 { ; CHECK-LABEL: _Z3foov: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopb ; nopa ; nops ; movxm p0, #(X+92); nopv -; CHECK-NEXT: mova dj0, #96 -; CHECK-NEXT: lda.u8 r1, [p0, dj0] -; CHECK-NEXT: mova dj0, #-68 -; CHECK-NEXT: lda r0, [p0, #8] -; CHECK-NEXT: lda r1, [p0, #0] -; CHECK-NEXT: lda.u16 r1, [p0, dj0] -; CHECK-NEXT: mova dj0, #-56 -; CHECK-NEXT: lda.u8 r1, [p0, dj0] +; CHECK-NEXT: nop ; movxm p1, #(X+92) +; CHECK-NEXT: mova m0, #-164 +; CHECK-NEXT: mov p0, p1 +; CHECK-NEXT: paddb [p0], #8 +; CHECK-NEXT: lda r0, [p0], #88 +; CHECK-NEXT: lda.u8 r1, [p0], m0 +; CHECK-NEXT: mova m0, #12 +; CHECK-NEXT: lda r1, [p1, #0] +; CHECK-NEXT: lda.u16 r1, [p0], m0 +; CHECK-NEXT: mova m0, #60 +; CHECK-NEXT: lda.u8 r1, [p0], m0 ; CHECK-NEXT: nop -; CHECK-NEXT: lda r1, [p0, #4] +; CHECK-NEXT: lda r1, [p0], #76 ; CHECK-NEXT: add r0, r0, r1 ; CHECK-NEXT: add r0, r0, r1 ; CHECK-NEXT: nop ; CHECK-NEXT: add r0, r0, r1 ; CHECK-NEXT: add r0, r0, r1 -; CHECK-NEXT: lda r1, [p0, #80] +; CHECK-NEXT: lda r1, [p0], #12 ; CHECK-NEXT: add r0, r0, r1 -; CHECK-NEXT: lda r1, [p0, #92] -; CHECK-NEXT: lda r1, [p0, #44] -; CHECK-NEXT: lda r1, [p0, #8] -; CHECK-NEXT: lda r1, [p0, #76] -; CHECK-NEXT: 
lda r1, [p0, #-60] +; CHECK-NEXT: lda r1, [p0], #-48 +; CHECK-NEXT: lda r1, [p0], #32 +; CHECK-NEXT: lda r1, [p1, #8] +; CHECK-NEXT: lda r1, [p0], #-136 +; CHECK-NEXT: lda r1, [p0, #0] ; CHECK-NEXT: add r0, r0, r1 ; CHECK-NEXT: ret lr ; CHECK-NEXT: add r0, r0, r1 // Delay Slot 5 diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll index cd2c5cd026c1..0238c5e44f6b 100644 --- a/llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll +++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll @@ -35,50 +35,50 @@ define void @add2d(ptr noalias %params, ptr noalias %ifm1_data, ptr noalias %ifm ; ASM-LABEL: add2d: ; ASM: .p2align 4 ; ASM-NEXT: // %bb.0: // %newFuncRoot -; ASM-NEXT: nopb ; lda dn0, [p0, #88]; nops ; nopxm ; nopv -; ASM-NEXT: lda dj0, [p0, #80] -; ASM-NEXT: lda dn4, [p0, #92]; paddb [sp], #32 -; ASM-NEXT: lda r0, [p0, #32]; st p6, [sp, #-28] // 4-byte Folded Spill -; ASM-NEXT: lda r1, [p0, #20]; mov p6, sp -; ASM-NEXT: lda dj4, [p0, #84]; paddb [p6], #-36 -; ASM-NEXT: lda r3, [p6, #0]; mov p6, sp -; ASM-NEXT: lda m2, [p0, #40]; paddb [p6], #-40 -; ASM-NEXT: lda r5, [p6, #0]; mov p6, sp -; ASM-NEXT: lda m1, [p0, #60]; paddb [p6], #-44 -; ASM-NEXT: lda r8, [p6, #0]; mov p6, sp -; ASM-NEXT: lda m4, [p0, #36]; st r1, [p4, #0]; nez r2, r0 -; ASM-NEXT: lda m0, [p0, #96]; paddb [p6], #-48; st r2, [p5, #0] -; ASM-NEXT: lda r9, [p6, #0]; mov p6, sp -; ASM-NEXT: lda m5, [p0, #44]; paddb [p6], #-52; mov p4, r3 -; ASM-NEXT: lda r4, [p6, #0]; mov p6, sp -; ASM-NEXT: lda m3, [p0, #52]; st m1, [p4, #0] -; ASM-NEXT: lda r0, [p0, #48]; paddb [p6], #-56; mov p4, r5 -; ASM-NEXT: lda r6, [p6, #0]; mov p6, sp -; ASM-NEXT: lda r3, [p0, #64]; st m0, [p4, #0] -; ASM-NEXT: lda r5, [p0, #28]; paddb [p6], #-60; mov p4, r8 -; ASM-NEXT: lda r7, [p6, #0]; mov p6, sp -; ASM-NEXT: lda r4, [p0, #12]; mov p0, sp -; ASM-NEXT: paddb [p0], #-72; st dj0, [p4, #0] -; ASM-NEXT: lda p5, [p0, #0]; mov p4, r9 -; ASM-NEXT: mov p0, sp -; 
ASM-NEXT: paddb [p0], #-76; st dj4, [p4, #0] -; ASM-NEXT: lda r9, [p0, #0]; mov p0, sp -; ASM-NEXT: paddb [p0], #-80; mov p4, r4 -; ASM-NEXT: lda r10, [p0, #0]; mov p0, sp -; ASM-NEXT: paddb [p0], #-84 -; ASM-NEXT: lda r11, [p0, #0]; paddb [p6], #-64; mov p0, sp -; ASM-NEXT: lda p7, [p6, #0]; paddb [p0], #-88; mov p6, sp -; ASM-NEXT: lda r12, [p0, #0]; st dn0, [p4, #0] -; ASM-NEXT: mova r6, #1; paddb [p6], #-68; mov p0, r6 -; ASM-NEXT: lda p6, [p6, #0]; st dn4, [p0, #0]; ne r5, r5, r6 -; ASM-NEXT: mova r6, #3; add r7, r3, #-1; mov p0, r7 -; ASM-NEXT: ltu r7, r7, r6 -; ASM-NEXT: st p7, [sp, #-32]; jz r7, #.LBB0_2 // 4-byte Folded Spill -; ASM-NEXT: nez r0, r0; mov p4, sp // Delay Slot 5 -; ASM-NEXT: paddb [p4], #-92; st r0, [p0, #0] // Delay Slot 4 -; ASM-NEXT: lda p4, [p4, #0]; paddb [p2], m4; st r4, [p7, #0] // Delay Slot 3 -; ASM-NEXT: padda [p1], m2; paddb [p2], m5; and r8, r3, r6; st r5, [p6, #0] // Delay Slot 2 -; ASM-NEXT: mova r6, #0; paddb [p2], m3; st r8, [p5, #0] // Delay Slot 1 +; ASM-NEXT: paddb [p0], #40; nopx +; ASM-NEXT: lda m2, [p0], #-4 +; ASM-NEXT: lda m3, [p0], #8 +; ASM-NEXT: lda m5, [p0], #8 +; ASM-NEXT: lda m4, [p0], #-24 +; ASM-NEXT: lda r4, [p0], #36; paddb [sp], #32 +; ASM-NEXT: lda r2, [p0], #-32; st p7, [sp, #-32] // 4-byte Folded Spill +; ASM-NEXT: lda r0, [p0], #-12; st p6, [sp, #-28] // 4-byte Folded Spill +; ASM-NEXT: lda r1, [p0], #40; mov p6, sp +; ASM-NEXT: paddb [p6], #-36; mov p7, sp +; ASM-NEXT: lda r5, [p6, #0]; paddb [p7], #-40 +; ASM-NEXT: lda p7, [p7, #0]; mov p6, sp +; ASM-NEXT: lda m1, [p0], #36; paddb [p6], #-44 +; ASM-NEXT: lda p6, [p6, #0] +; ASM-NEXT: lda m0, [p0], #-8 +; ASM-NEXT: lda dn0, [p0], #-8; st r1, [p4, #0] +; ASM-NEXT: lda dj0, [p0], #12; nez r3, r0; mov p4, sp +; ASM-NEXT: lda dn4, [p0], #-8; st r3, [p5, #0] +; ASM-NEXT: lda dj4, [p0], #-36; paddb [p4], #-48; mov p5, r5 +; ASM-NEXT: lda p4, [p4, #0]; st m1, [p5, #0] +; ASM-NEXT: lda r0, [p0, #0]; mov p5, sp +; ASM-NEXT: lda r5, [p0, #-36]; paddb [p5], #-52 
+; ASM-NEXT: lda p5, [p5, #0]; mov p0, sp +; ASM-NEXT: st m0, [p7, #0] +; ASM-NEXT: mov p7, sp +; ASM-NEXT: paddb [p7], #-56; st dj0, [p6, #0] +; ASM-NEXT: lda r6, [p7, #0]; mov p6, sp +; ASM-NEXT: paddb [p0], #-72; mov p7, sp +; ASM-NEXT: lda p0, [p0, #0]; paddb [p6], #-60; st dj4, [p4, #0] +; ASM-NEXT: lda r7, [p6, #0]; mov p4, sp +; ASM-NEXT: paddb [p4], #-76; mov p6, sp +; ASM-NEXT: lda r11, [p4, #0]; paddb [p7], #-64; mov p4, sp +; ASM-NEXT: lda p7, [p7, #0]; paddb [p6], #-68; st dn0, [p5, #0] +; ASM-NEXT: lda r8, [p6, #0]; paddb [p4], #-80; nez r0, r0; mov p5, r6 +; ASM-NEXT: lda p6, [p4, #0]; st dn4, [p5, #0]; movx r6, #1 +; ASM-NEXT: ne r4, r4, r6; mov p4, sp +; ASM-NEXT: mova r6, #3; paddb [p4], #-84; add r7, r2, #-1; mov p5, r7 +; ASM-NEXT: lda r9, [p4, #0]; ltu r7, r7, r6; mov p4, sp +; ASM-NEXT: st r0, [p5, #0]; paddb [p4], #-88; jz r7, #.LBB0_2 +; ASM-NEXT: lda r10, [p4, #0]; mov p4, sp // Delay Slot 5 +; ASM-NEXT: paddb [p4], #-92; st r5, [p7, #0] // Delay Slot 4 +; ASM-NEXT: lda p4, [p4, #0]; paddb [p2], m3; mov p7, r8 // Delay Slot 3 +; ASM-NEXT: st r4, [p7, #0]; paddb [p2], m5; and r8, r2, r6 // Delay Slot 2 +; ASM-NEXT: padda [p1], m2; paddb [p2], m4; movx r6, #0; st r8, [p0, #0] // Delay Slot 1 ; ASM-NEXT: // %bb.1: ; ASM-NEXT: nopb ; nopa ; nops ; j #.LBB0_5; nopv ; ASM-NEXT: nopa ; nopx // Delay Slot 5 @@ -92,17 +92,17 @@ define void @add2d(ptr noalias %params, ptr noalias %ifm1_data, ptr noalias %ifm ; ASM-NEXT: vlda.ups.s32.d8 cm1, s1, [p1], m1; mov dc4, dc0 ; ASM-NEXT: vlda.3d.ups.s32.d8 cm2, s1, [p2], d0 ; ASM-NEXT: vlda.ups.s32.d8 cm3, s1, [p1], m1 -; ASM-NEXT: vlda.3d.ups.s32.d8 cm6, s1, [p2], d0; mov crUPSSign, r2 +; ASM-NEXT: vlda.3d.ups.s32.d8 cm6, s1, [p2], d0; mov crUPSSign, r3 ; ASM-NEXT: vlda.ups.s32.d8 cm4, s1, [p1], m1; mov s1, r1 ; ASM-NEXT: vlda.3d.ups.s32.d8 cm7, s1, [p2], d0 ; ASM-NEXT: vlda.3d.ups.s32.d8 cm5, s1, [p2], d0 ; ASM-NEXT: movxm ls, #.LBB0_3 ; ASM-NEXT: mova r6, #-4; movxm le, #.L_LEnd0 -; ASM-NEXT: and r3, r3, r6 
-; ASM-NEXT: mova r6, #-2; add r3, r3, #-4 -; ASM-NEXT: lshl r3, r3, r6; mov crSRSSign, r5 -; ASM-NEXT: add r3, r3, #1; mov s0, r4 -; ASM-NEXT: add.nc lc, r3, #-1 +; ASM-NEXT: and r2, r2, r6 +; ASM-NEXT: mova r6, #-2; add r2, r2, #-4 +; ASM-NEXT: lshl r2, r2, r6; mov crSRSSign, r4 +; ASM-NEXT: add r2, r2, #1; mov s0, r5 +; ASM-NEXT: add.nc lc, r2, #-1 ; ASM-NEXT: .p2align 4 ; ASM-NEXT: .LBB0_3: // %for.body ; ASM-NEXT: // =>This Inner Loop Header: Depth=1 @@ -133,14 +133,13 @@ define void @add2d(ptr noalias %params, ptr noalias %ifm1_data, ptr noalias %ifm ; ASM-NEXT: mov crSRSSign, #0 ; ASM-NEXT: .p2align 4 ; ASM-NEXT: .LBB0_5: // %for.cond.cleanup.unr-lcssa.split -; ASM-NEXT: nopa ; mov p0, r12 -; ASM-NEXT: st r0, [p4, #0] -; ASM-NEXT: lda p7, [sp, #-32]; st r6, [p0, #0] // 4-byte Folded Reload -; ASM-NEXT: lda p6, [sp, #-28]; mov p0, r11 // 4-byte Folded Reload -; ASM-NEXT: st p3, [p0, #0]; ret lr -; ASM-NEXT: mov p0, r10 // Delay Slot 5 -; ASM-NEXT: st p2, [p0, #0] // Delay Slot 4 -; ASM-NEXT: mov p0, r9 // Delay Slot 3 +; ASM-NEXT: nopx ; mov p0, r10 +; ASM-NEXT: lda p7, [sp, #-32]; st r0, [p4, #0] // 4-byte Folded Reload +; ASM-NEXT: lda p6, [sp, #-28]; st r6, [p0, #0] // 4-byte Folded Reload +; ASM-NEXT: ret lr ; mov p0, r9 +; ASM-NEXT: st p3, [p0, #0] // Delay Slot 5 +; ASM-NEXT: mov p0, r11 // Delay Slot 4 +; ASM-NEXT: st p2, [p6, #0] // Delay Slot 3 ; ASM-NEXT: st p1, [p0, #0] // Delay Slot 2 ; ASM-NEXT: paddb [sp], #-32 // Delay Slot 1 newFuncRoot: diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/Memops.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/Memops.ll index 998b261dff06..caf412862c58 100644 --- a/llvm/test/CodeGen/AIE/aie2/end-to-end/Memops.ll +++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/Memops.ll @@ -20,15 +20,15 @@ define dso_local void @lowerMemcpyUsingWord() local_unnamed_addr #0 { ; CHECK-NEXT: lda r1, [p0], #4 ; CHECK-NEXT: lda r2, [p0], #4 ; CHECK-NEXT: lda r3, [p0], #4 -; CHECK-NEXT: lda r4, [p0], #4 -; CHECK-NEXT: lda r5, [p0, #0] +; 
CHECK-NEXT: lda r4, [p0, #0] +; CHECK-NEXT: lda r5, [p0, #4] ; CHECK-NEXT: movxm p0, #buffer1 ; CHECK-NEXT: st r0, [p0], #4 ; CHECK-NEXT: st r1, [p0], #4; ret lr ; CHECK-NEXT: st r2, [p0], #4 // Delay Slot 5 ; CHECK-NEXT: st r3, [p0], #4 // Delay Slot 4 -; CHECK-NEXT: st r4, [p0], #4 // Delay Slot 3 -; CHECK-NEXT: st r5, [p0, #0] // Delay Slot 2 +; CHECK-NEXT: st r4, [p0, #0] // Delay Slot 3 +; CHECK-NEXT: st r5, [p0, #4] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: tail call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 4 dereferenceable(24) @buffer1, ptr noundef nonnull align 4 dereferenceable(24) @buffer2, i32 24, i1 false) @@ -73,16 +73,16 @@ define dso_local void @lowerMemcpyUsingHalfByte() local_unnamed_addr #0 { ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: nopa ; nopb ; movxm p0, #buffer2; nops -; CHECK-NEXT: lda.s16 r0, [p0], #2; movxm p1, #buffer1 -; CHECK-NEXT: st.s16 r0, [p1], #2 +; CHECK-NEXT: lda.s16 r0, [p0, #0]; movxm p1, #buffer1 +; CHECK-NEXT: st.s16 r0, [p1, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: lda.s8 r0, [p0, #0] -; CHECK-NEXT: st.s8 r0, [p1, #0] +; CHECK-NEXT: lda.s8 r0, [p0, #2] +; CHECK-NEXT: st.s8 r0, [p1, #2] ; CHECK-NEXT: nop ; CHECK-NEXT: ret lr ; CHECK-NEXT: nop // Delay Slot 5 @@ -101,31 +101,37 @@ define dso_local void @lowerMemcpyUsingWordHalfByte() local_unnamed_addr #0 { ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: nopb ; nopa ; nops ; movxm p0, #(buffer2+8); nopv -; CHECK-NEXT: lda.s16 r0, [p0, #0]; nopb ; nopx -; CHECK-NEXT: lda r1, [p0, #-4]; movxm p1, #(buffer1+8) +; CHECK-NEXT: lda.s16 r0, [p0, #0]; nopb ; movxm p1, #(buffer1+8) ; CHECK-NEXT: st.s16 r0, [p1, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: paddb [p0], #-8 +; CHECK-NEXT: lda r0, [p0], #4 ; CHECK-NEXT: nop -; CHECK-NEXT: lda.s8 r0, [p0, #2] -; 
CHECK-NEXT: st.s8 r0, [p1, #2] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: mova dj0, #6; paddb [p1], #-8 +; CHECK-NEXT: lda.s8 r0, [p0, dj0]; st r0, [p1], #4 +; CHECK-NEXT: st.s8 r0, [p1, dj0] +; CHECK-NEXT: nop +; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: lda r0, [p0, #-8] +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: lda r0, [p0, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: ret lr ; CHECK-NEXT: nop // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 -; CHECK-NEXT: st r1, [p1, #-4] // Delay Slot 3 -; CHECK-NEXT: st r0, [p1, #-8] // Delay Slot 2 +; CHECK-NEXT: nop // Delay Slot 3 +; CHECK-NEXT: st r0, [p1, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: tail call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 4 dereferenceable(11) @buffer1, ptr noundef nonnull align 4 dereferenceable(11) @buffer2, i32 11, i1 false) @@ -159,17 +165,18 @@ define dso_local void @lowerMemcpyUsingWordVector16() local_unnamed_addr #0 { ; CHECK-LABEL: lowerMemcpyUsingWordVector16: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nop ; movxm p0, #buffer2 -; CHECK-NEXT: lda q0, [p0], #16 -; CHECK-NEXT: lda q2, [p0], #16 -; CHECK-NEXT: lda r0, [p0, #0] +; CHECK-NEXT: nopb ; nopa ; nops ; movxm p0, #buffer2; nopv +; CHECK-NEXT: lda q0, [p0], #16; nopx +; CHECK-NEXT: vlda.128 wh0, [p0] +; CHECK-NEXT: lda r0, [p0, #16] +; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: ret lr ; CHECK-NEXT: movxm p0, #buffer1 // Delay Slot 5 -; CHECK-NEXT: st q0, [p0], #16 // Delay Slot 4 -; CHECK-NEXT: st q2, [p0], #16 // Delay Slot 3 -; CHECK-NEXT: st r0, [p0, #0] // Delay Slot 2 +; CHECK-NEXT: st q0, [p0], #16; vmov q0, wh0 // Delay Slot 4 +; CHECK-NEXT: st r0, [p0, #16] // Delay Slot 3 +; CHECK-NEXT: st q0, [p0, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: tail call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 16 dereferenceable(36) @buffer1, ptr 
noundef nonnull align 16 dereferenceable(36) @buffer2, i32 36, i1 false) @@ -185,15 +192,15 @@ define dso_local void @lowerMemcpyUsingWordVector32() local_unnamed_addr #0 { ; CHECK-NEXT: vldb wh0, [p0], #32 ; CHECK-NEXT: lda r0, [p0], #4 ; CHECK-NEXT: lda r1, [p0], #4 -; CHECK-NEXT: lda r2, [p0], #4 -; CHECK-NEXT: lda r3, [p0, #0] +; CHECK-NEXT: lda r2, [p0, #0] +; CHECK-NEXT: lda r3, [p0, #4] ; CHECK-NEXT: nop ; CHECK-NEXT: movxm p0, #buffer1 ; CHECK-NEXT: vst wh0, [p0], #32; ret lr ; CHECK-NEXT: st r0, [p0], #4 // Delay Slot 5 ; CHECK-NEXT: st r1, [p0], #4 // Delay Slot 4 -; CHECK-NEXT: st r2, [p0], #4 // Delay Slot 3 -; CHECK-NEXT: st r3, [p0, #0] // Delay Slot 2 +; CHECK-NEXT: st r2, [p0, #0] // Delay Slot 3 +; CHECK-NEXT: st r3, [p0, #4] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: tail call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 32 dereferenceable(48) @buffer1, ptr noundef nonnull align 32 dereferenceable(48) @buffer2, i32 48, i1 false) @@ -241,8 +248,8 @@ define dso_local void @lowerMemsetUsingWordVector32() local_unnamed_addr #2 { ; CHECK-NEXT: vst wl0, [p0], #32; ret lr ; CHECK-NEXT: st r0, [p0], #4 // Delay Slot 5 ; CHECK-NEXT: st r0, [p0], #4 // Delay Slot 4 -; CHECK-NEXT: st r0, [p0], #4 // Delay Slot 3 -; CHECK-NEXT: st r0, [p0, #0] // Delay Slot 2 +; CHECK-NEXT: st r0, [p0, #0] // Delay Slot 3 +; CHECK-NEXT: st r0, [p0, #4] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: tail call void @llvm.memset.p0.i32(ptr noundef nonnull align 32 dereferenceable(48) @buffer1, i8 0, i32 48, i1 false) diff --git a/llvm/test/CodeGen/AIE/aie2/movxm_test.ll b/llvm/test/CodeGen/AIE/aie2/movxm_test.ll index 0209d95d71e6..a457b6bbc339 100644 --- a/llvm/test/CodeGen/AIE/aie2/movxm_test.ll +++ b/llvm/test/CodeGen/AIE/aie2/movxm_test.ll @@ -16,8 +16,8 @@ define dso_local noundef i32 @_Z4getAv() { ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: movxm p0, #(d+4) -; CHECK-NEXT: lda r0, [p0], #4 -; CHECK-NEXT: lda r1, [p0, 
#0] +; CHECK-NEXT: lda r0, [p0, #0] +; CHECK-NEXT: lda r1, [p0, #4] ; CHECK-NEXT: nop ; CHECK-NEXT: ret lr ; CHECK-NEXT: nop // Delay Slot 5 diff --git a/llvm/test/CodeGen/AIE/aie2/vabs_gtz.ll b/llvm/test/CodeGen/AIE/aie2/vabs_gtz.ll index a06857d4bd65..a1c9fe8c87a4 100644 --- a/llvm/test/CodeGen/AIE/aie2/vabs_gtz.ll +++ b/llvm/test/CodeGen/AIE/aie2/vabs_gtz.ll @@ -15,8 +15,8 @@ define dso_local noundef <64 x i8> @_Z20test_abs_gtz_v64int8Dv64_abRy(<64 x i8> ; CHECK-NEXT: ret lr ; CHECK-NEXT: vabs_gtz.d8 x0, r25:r24, x2 // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 -; CHECK-NEXT: st r24, [p0], #4 // Delay Slot 3 -; CHECK-NEXT: st r25, [p0, #0] // Delay Slot 2 +; CHECK-NEXT: st r24, [p0, #0] // Delay Slot 3 +; CHECK-NEXT: st r25, [p0, #4] // Delay Slot 2 ; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 1 entry: %conv.i = zext i1 %sgn to i32 @@ -34,8 +34,8 @@ define dso_local noundef <64 x i8> @_Z27test_abs_gtz_v64uint8_sign0Dv64_hRy(<64 ; CHECK-NEXT: nopb ; nopa ; nops ; ret lr ; nopm ; nopv ; CHECK-NEXT: vabs_gtz.d8 x0, r25:r24, x2 // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 -; CHECK-NEXT: st r24, [p0], #4 // Delay Slot 3 -; CHECK-NEXT: st r25, [p0, #0] // Delay Slot 2 +; CHECK-NEXT: st r24, [p0, #0] // Delay Slot 3 +; CHECK-NEXT: st r25, [p0, #4] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: %0 = tail call { <64 x i8>, <2 x i32> } @llvm.aie2.vabs.gtz8(<64 x i8> %a, i32 0) @@ -52,8 +52,8 @@ define dso_local noundef <64 x i8> @_Z27test_abs_gtz_v64uint8_sign1Dv64_hRy(<64 ; CHECK-NEXT: nopb ; nopa ; nops ; ret lr ; nopm ; nopv ; CHECK-NEXT: vabs_gtz.s8 x0, r25:r24, x2 // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 -; CHECK-NEXT: st r24, [p0], #4 // Delay Slot 3 -; CHECK-NEXT: st r25, [p0, #0] // Delay Slot 2 +; CHECK-NEXT: st r24, [p0, #0] // Delay Slot 3 +; CHECK-NEXT: st r25, [p0, #4] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: %0 = tail call { <64 x i8>, <2 x i32> } @llvm.aie2.vabs.gtz8(<64 x i8> %a, i32 1) diff --git 
a/llvm/test/CodeGen/AIE/aie2/vbneg_ltz.ll b/llvm/test/CodeGen/AIE/aie2/vbneg_ltz.ll index 08f0da28aec6..d8be049608c1 100644 --- a/llvm/test/CodeGen/AIE/aie2/vbneg_ltz.ll +++ b/llvm/test/CodeGen/AIE/aie2/vbneg_ltz.ll @@ -14,8 +14,8 @@ define dso_local noundef <64 x i8> @_Z22test_bneg_ltz_v64uint8Dv64_hRy(<64 x i8> ; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm ; nops ; CHECK-NEXT: mova r0, #-1 // Delay Slot 5 ; CHECK-NEXT: vbneg_ltz.s8 x0, r25:r24, x2 // Delay Slot 4 -; CHECK-NEXT: st r0, [p0], #4 // Delay Slot 3 -; CHECK-NEXT: st r0, [p0, #0] // Delay Slot 2 +; CHECK-NEXT: st r0, [p0, #0] // Delay Slot 3 +; CHECK-NEXT: st r0, [p0, #4] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: %0 = tail call { <64 x i8>, <2 x i32> } @llvm.aie2.vbneg.ltz8(<64 x i8> %a) diff --git a/llvm/test/CodeGen/AIE/aie2/vmax_lt.ll b/llvm/test/CodeGen/AIE/aie2/vmax_lt.ll index 1947a0dd67f7..61207fb0a780 100644 --- a/llvm/test/CodeGen/AIE/aie2/vmax_lt.ll +++ b/llvm/test/CodeGen/AIE/aie2/vmax_lt.ll @@ -15,8 +15,8 @@ define dso_local noundef <64 x i8> @_Z20test_max_lt_v64uint8Dv64_hS_Ry(<64 x i8> ; CHECK-NEXT: nopb ; nopa ; nops ; ret lr ; nopm ; nopv ; CHECK-NEXT: vmax_lt.d8 x0, r25:r24, x2, x4 // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 -; CHECK-NEXT: st r24, [p0], #4 // Delay Slot 3 -; CHECK-NEXT: st r25, [p0, #0] // Delay Slot 2 +; CHECK-NEXT: st r24, [p0, #0] // Delay Slot 3 +; CHECK-NEXT: st r25, [p0, #4] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: %0 = tail call { <64 x i8>, <2 x i32> } @llvm.aie2.vmax.lt8(<64 x i8> %a, <64 x i8> %b, i32 0) @@ -35,8 +35,8 @@ define dso_local noundef <64 x i8> @_Z20test_max_lt_v64uint8Dv64_hS_bRy(<64 x i8 ; CHECK-NEXT: ret lr ; CHECK-NEXT: vmax_lt.d8 x0, r25:r24, x2, x4 // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 -; CHECK-NEXT: st r24, [p0], #4 // Delay Slot 3 -; CHECK-NEXT: st r25, [p0, #0] // Delay Slot 2 +; CHECK-NEXT: st r24, [p0, #0] // Delay Slot 3 +; CHECK-NEXT: st r25, [p0, #4] // Delay Slot 2 ; CHECK-NEXT: mov 
crVaddSign, #0 // Delay Slot 1 entry: %conv.i = zext i1 %sgn to i32 @@ -88,8 +88,8 @@ define dso_local noundef <64 x i8> @_Z19test_max_lt_v64int8Dv64_aS_Ry(<64 x i8> ; CHECK-NEXT: nopb ; nopa ; nops ; ret lr ; nopm ; nopv ; CHECK-NEXT: vmax_lt.s8 x0, r25:r24, x2, x4 // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 -; CHECK-NEXT: st r24, [p0], #4 // Delay Slot 3 -; CHECK-NEXT: st r25, [p0, #0] // Delay Slot 2 +; CHECK-NEXT: st r24, [p0, #0] // Delay Slot 3 +; CHECK-NEXT: st r25, [p0, #4] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: %0 = tail call { <64 x i8>, <2 x i32> } @llvm.aie2.vmax.lt8(<64 x i8> %a, <64 x i8> %b, i32 1) @@ -108,8 +108,8 @@ define dso_local noundef <64 x i8> @_Z19test_max_lt_v64int8Dv64_aS_bRy(<64 x i8> ; CHECK-NEXT: ret lr ; CHECK-NEXT: vmax_lt.d8 x0, r25:r24, x2, x4 // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 -; CHECK-NEXT: st r24, [p0], #4 // Delay Slot 3 -; CHECK-NEXT: st r25, [p0, #0] // Delay Slot 2 +; CHECK-NEXT: st r24, [p0, #0] // Delay Slot 3 +; CHECK-NEXT: st r25, [p0, #4] // Delay Slot 2 ; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 1 entry: %conv.i = zext i1 %sgn to i32 diff --git a/llvm/test/CodeGen/AIE/aie2/vmaxdiff_lt.ll b/llvm/test/CodeGen/AIE/aie2/vmaxdiff_lt.ll index a67d8d4e867d..a93d4ac798cf 100644 --- a/llvm/test/CodeGen/AIE/aie2/vmaxdiff_lt.ll +++ b/llvm/test/CodeGen/AIE/aie2/vmaxdiff_lt.ll @@ -18,8 +18,8 @@ define <64 x i8> @test_vmaxdiff_lt_v64int8(<64 x i8> %a, <64 x i8> %b, i1 %sgn, ; CHECK-NEXT: ret lr ; CHECK-NEXT: vmaxdiff_lt.d8 x0, r25:r24, x2, x4 // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 -; CHECK-NEXT: st r24, [p0], #4 // Delay Slot 3 -; CHECK-NEXT: st r25, [p0, #0] // Delay Slot 2 +; CHECK-NEXT: st r24, [p0, #0] // Delay Slot 3 +; CHECK-NEXT: st r25, [p0, #4] // Delay Slot 2 ; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 1 entry: %conv.i = zext i1 %sgn to i32 @@ -37,8 +37,8 @@ define <64 x i8> @test_vmaxdiff_lt_v64uint8_sign0(<64 x i8> %a, <64 x i8> %b, ; CHECK-NEXT: nopb ; nopa ; nops 
; ret lr ; nopm ; nopv ; CHECK-NEXT: vmax_lt.d8 x0, r25:r24, x2, x4 // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 -; CHECK-NEXT: st r24, [p0], #4 // Delay Slot 3 -; CHECK-NEXT: st r25, [p0, #0] // Delay Slot 2 +; CHECK-NEXT: st r24, [p0, #0] // Delay Slot 3 +; CHECK-NEXT: st r25, [p0, #4] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: %0 = tail call { <64 x i8>, <2 x i32> } @llvm.aie2.vmaxdiff.lt8(<64 x i8> %a, <64 x i8> %b, i32 0) @@ -55,8 +55,8 @@ define <64 x i8> @test_vmaxdiff_lt_v64uint8_sign1(<64 x i8> %a, <64 x i8> %b, ; CHECK-NEXT: nopb ; nopa ; nops ; ret lr ; nopm ; nopv ; CHECK-NEXT: vmax_lt.s8 x0, r25:r24, x2, x4 // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 -; CHECK-NEXT: st r24, [p0], #4 // Delay Slot 3 -; CHECK-NEXT: st r25, [p0, #0] // Delay Slot 2 +; CHECK-NEXT: st r24, [p0, #0] // Delay Slot 3 +; CHECK-NEXT: st r25, [p0, #4] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: %0 = tail call { <64 x i8>, <2 x i32> } @llvm.aie2.vmaxdiff.lt8(<64 x i8> %a, <64 x i8> %b, i32 1) diff --git a/llvm/test/CodeGen/AIE/aie2/vmin_ge.ll b/llvm/test/CodeGen/AIE/aie2/vmin_ge.ll index 6e4dc24218f1..681efcaed2bc 100644 --- a/llvm/test/CodeGen/AIE/aie2/vmin_ge.ll +++ b/llvm/test/CodeGen/AIE/aie2/vmin_ge.ll @@ -15,8 +15,8 @@ define dso_local noundef <64 x i8> @_Z20test_min_ge_v64uint8Dv64_hS_Ry(<64 x i8> ; CHECK-NEXT: nopb ; nopa ; nops ; ret lr ; nopm ; nopv ; CHECK-NEXT: vmin_ge.d8 x0, r25:r24, x2, x4 // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 -; CHECK-NEXT: st r24, [p0], #4 // Delay Slot 3 -; CHECK-NEXT: st r25, [p0, #0] // Delay Slot 2 +; CHECK-NEXT: st r24, [p0, #0] // Delay Slot 3 +; CHECK-NEXT: st r25, [p0, #4] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: %0 = tail call { <64 x i8>, <2 x i32> } @llvm.aie2.vmin.ge8(<64 x i8> %a, <64 x i8> %b, i32 0) @@ -35,8 +35,8 @@ define dso_local noundef <64 x i8> @_Z20test_min_ge_v64uint8Dv64_hS_bRy(<64 x i8 ; CHECK-NEXT: ret lr ; CHECK-NEXT: vmin_ge.d8 x0, r25:r24, x2, x4 // Delay 
Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 -; CHECK-NEXT: st r24, [p0], #4 // Delay Slot 3 -; CHECK-NEXT: st r25, [p0, #0] // Delay Slot 2 +; CHECK-NEXT: st r24, [p0, #0] // Delay Slot 3 +; CHECK-NEXT: st r25, [p0, #4] // Delay Slot 2 ; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 1 entry: %conv.i = zext i1 %sgn to i32 @@ -88,8 +88,8 @@ define dso_local noundef <64 x i8> @_Z19test_min_ge_v64int8Dv64_aS_Ry(<64 x i8> ; CHECK-NEXT: nopb ; nopa ; nops ; ret lr ; nopm ; nopv ; CHECK-NEXT: vmin_ge.s8 x0, r25:r24, x2, x4 // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 -; CHECK-NEXT: st r24, [p0], #4 // Delay Slot 3 -; CHECK-NEXT: st r25, [p0, #0] // Delay Slot 2 +; CHECK-NEXT: st r24, [p0, #0] // Delay Slot 3 +; CHECK-NEXT: st r25, [p0, #4] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: %0 = tail call { <64 x i8>, <2 x i32> } @llvm.aie2.vmin.ge8(<64 x i8> %a, <64 x i8> %b, i32 1) @@ -108,8 +108,8 @@ define dso_local noundef <64 x i8> @_Z19test_min_ge_v64int8Dv64_aS_bRy(<64 x i8> ; CHECK-NEXT: ret lr ; CHECK-NEXT: vmin_ge.d8 x0, r25:r24, x2, x4 // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 -; CHECK-NEXT: st r24, [p0], #4 // Delay Slot 3 -; CHECK-NEXT: st r25, [p0, #0] // Delay Slot 2 +; CHECK-NEXT: st r24, [p0, #0] // Delay Slot 3 +; CHECK-NEXT: st r25, [p0, #4] // Delay Slot 2 ; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 1 entry: %conv.i = zext i1 %sgn to i32 diff --git a/llvm/test/CodeGen/AIE/aie2/vst_srs.ll b/llvm/test/CodeGen/AIE/aie2/vst_srs.ll index a91417a6e0fb..0c056efc751a 100644 --- a/llvm/test/CodeGen/AIE/aie2/vst_srs.ll +++ b/llvm/test/CodeGen/AIE/aie2/vst_srs.ll @@ -381,10 +381,10 @@ define dso_local noundef <16 x i16> @test_postincrement(ptr %array, <8 x i64> no ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: nopb ; mova r0, #2; nops ; nopxm ; nopv ; CHECK-NEXT: nopa ; mov s0, r0 -; CHECK-NEXT: vst.srs.d16.s32 bml0, s0, [p0], #32 +; CHECK-NEXT: vst.srs.d16.s32 bml0, s0, [p0, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: vldb wl0, [p0, 
#0] +; CHECK-NEXT: vldb wl0, [p0, #32] ; CHECK-NEXT: ret lr ; CHECK-NEXT: nop // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 diff --git a/llvm/test/CodeGen/AIE/aie2/vsub-cmp.ll b/llvm/test/CodeGen/AIE/aie2/vsub-cmp.ll index 234f0f707a83..c3cd2002aa8d 100644 --- a/llvm/test/CodeGen/AIE/aie2/vsub-cmp.ll +++ b/llvm/test/CodeGen/AIE/aie2/vsub-cmp.ll @@ -17,8 +17,8 @@ define <64 x i8> @test_vsub_ge_v64int8(<64 x i8> %a, <64 x i8> %b, i1 %sgn, ptr ; CHECK-NEXT: ret lr ; CHECK-NEXT: vsub_ge.d8 x0, r25:r24, x2, x4 // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 -; CHECK-NEXT: st r24, [p0], #4 // Delay Slot 3 -; CHECK-NEXT: st r25, [p0, #0] // Delay Slot 2 +; CHECK-NEXT: st r24, [p0, #0] // Delay Slot 3 +; CHECK-NEXT: st r25, [p0, #4] // Delay Slot 2 ; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 1 entry: %conv.i = zext i1 %sgn to i32 @@ -36,8 +36,8 @@ define <64 x i8> @test_vsub_ge_v64uint8_sign0(<64 x i8> %a, <64 x i8> %b, ptr ; CHECK-NEXT: nopb ; nopa ; nops ; ret lr ; nopm ; nopv ; CHECK-NEXT: vsub_ge.d8 x0, r25:r24, x2, x4 // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 -; CHECK-NEXT: st r24, [p0], #4 // Delay Slot 3 -; CHECK-NEXT: st r25, [p0, #0] // Delay Slot 2 +; CHECK-NEXT: st r24, [p0, #0] // Delay Slot 3 +; CHECK-NEXT: st r25, [p0, #4] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: %0 = tail call { <64 x i8>, <2 x i32> } @llvm.aie2.vsub.ge8(<64 x i8> %a, <64 x i8> %b, i32 0) @@ -54,8 +54,8 @@ define <64 x i8> @test_vsub_ge_v64uint8_sign1(<64 x i8> %a, <64 x i8> %b, ptr ; CHECK-NEXT: nopb ; nopa ; nops ; ret lr ; nopm ; nopv ; CHECK-NEXT: vsub_ge.s8 x0, r25:r24, x2, x4 // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 -; CHECK-NEXT: st r24, [p0], #4 // Delay Slot 3 -; CHECK-NEXT: st r25, [p0, #0] // Delay Slot 2 +; CHECK-NEXT: st r24, [p0, #0] // Delay Slot 3 +; CHECK-NEXT: st r25, [p0, #4] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: %0 = tail call { <64 x i8>, <2 x i32> } @llvm.aie2.vsub.ge8(<64 x i8> %a, <64 x i8> %b, i32 1) @@ 
-192,8 +192,8 @@ define <64 x i8> @test_vsub_lt_v64int8(<64 x i8> %a, <64 x i8> %b, i1 %sgn, ptr ; CHECK-NEXT: ret lr ; CHECK-NEXT: vsub_lt.d8 x0, r25:r24, x2, x4 // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 -; CHECK-NEXT: st r24, [p0], #4 // Delay Slot 3 -; CHECK-NEXT: st r25, [p0, #0] // Delay Slot 2 +; CHECK-NEXT: st r24, [p0, #0] // Delay Slot 3 +; CHECK-NEXT: st r25, [p0, #4] // Delay Slot 2 ; CHECK-NEXT: mov crVaddSign, #0 // Delay Slot 1 entry: %conv.i = zext i1 %sgn to i32 @@ -211,8 +211,8 @@ define <64 x i8> @test_vsub_lt_v64uint8_sign0(<64 x i8> %a, <64 x i8> %b, ptr ; CHECK-NEXT: nopb ; nopa ; nops ; ret lr ; nopm ; nopv ; CHECK-NEXT: vsub_lt.d8 x0, r25:r24, x2, x4 // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 -; CHECK-NEXT: st r24, [p0], #4 // Delay Slot 3 -; CHECK-NEXT: st r25, [p0, #0] // Delay Slot 2 +; CHECK-NEXT: st r24, [p0, #0] // Delay Slot 3 +; CHECK-NEXT: st r25, [p0, #4] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: %0 = tail call { <64 x i8>, <2 x i32> } @llvm.aie2.vsub.lt8(<64 x i8> %a, <64 x i8> %b, i32 0) @@ -229,8 +229,8 @@ define <64 x i8> @test_vsub_lt_v64uint8_sign1(<64 x i8> %a, <64 x i8> %b, ptr ; CHECK-NEXT: nopb ; nopa ; nops ; ret lr ; nopm ; nopv ; CHECK-NEXT: vsub_lt.s8 x0, r25:r24, x2, x4 // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 -; CHECK-NEXT: st r24, [p0], #4 // Delay Slot 3 -; CHECK-NEXT: st r25, [p0, #0] // Delay Slot 2 +; CHECK-NEXT: st r24, [p0, #0] // Delay Slot 3 +; CHECK-NEXT: st r25, [p0, #4] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: %0 = tail call { <64 x i8>, <2 x i32> } @llvm.aie2.vsub.lt8(<64 x i8> %a, <64 x i8> %b, i32 1) diff --git a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/MBB-crossing.mir b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/MBB-crossing.mir index a0b092679584..eb6a2335e763 100644 --- a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/MBB-crossing.mir +++ 
b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/MBB-crossing.mir @@ -29,8 +29,8 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 192 - ; CHECK-NEXT: [[AIE_POSTINC_STORE:%[0-9]+]]:_(p0) = G_AIE_POSTINC_STORE [[C1]](s32), [[COPY]], [[C]](s20) :: (store (s32)) - ; CHECK-NEXT: G_STORE [[C1]](s32), [[AIE_POSTINC_STORE]](p0) :: (store (s32)) + ; CHECK-NEXT: G_STORE [[C1]](s32), [[COPY]](p0) :: (store (s32)) + ; CHECK-NEXT: G_AIE_OFFSET_STORE [[C1]](s32), [[COPY]](p0), [[C]](s20) :: (store (s32)) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: G_STORE [[C1]](s32), [[COPY]](p0) :: (store (s32)) @@ -122,8 +122,8 @@ body: | ; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %6(s32), %bb.1, [[C3]](s32), %bb.0 ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = nsw G_ADD [[PHI]], [[C4]] - ; CHECK-NEXT: [[AIE_POSTINC_STORE:%[0-9]+]]:_(p0) = G_AIE_POSTINC_STORE [[ADD]](s32), [[COPY]], [[C2]](s20) :: (store (s32)) - ; CHECK-NEXT: G_STORE [[ADD]](s32), [[AIE_POSTINC_STORE]](p0) :: (store (s32)) + ; CHECK-NEXT: G_STORE [[ADD]](s32), [[COPY]](p0) :: (store (s32)) + ; CHECK-NEXT: G_AIE_OFFSET_STORE [[ADD]](s32), [[COPY]](p0), [[C2]](s20) :: (store (s32)) ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(sgt), [[ADD]](s32), [[C1]] ; CHECK-NEXT: G_BRCOND [[ICMP]](s32), %bb.1 ; CHECK-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/post-inc-eagerness.mir b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/post-inc-eagerness.mir index 9d3795839f82..d4ab16990614 100644 --- a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/post-inc-eagerness.mir +++ b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/post-inc-eagerness.mir @@ -31,11 +31,10 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 ; CHECK-NEXT: 
[[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 192 + ; CHECK-NEXT: [[AIE_POSTINC_3D_STORE:%[0-9]+]]:_(p0), [[AIE_POSTINC_3D_STORE1:%[0-9]+]]:_(s20), [[AIE_POSTINC_3D_STORE2:%[0-9]+]]:_ = G_AIE_POSTINC_3D_STORE [[C1]](s32), [[COPY]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (store (s32)) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s20) = G_CONSTANT i20 8 - ; CHECK-NEXT: [[AIE_POSTINC_STORE:%[0-9]+]]:_(p0) = G_AIE_POSTINC_STORE [[C1]](s32), [[COPY]], [[C2]](s20) :: (store (s32)) - ; CHECK-NEXT: G_STORE [[C1]](s32), [[AIE_POSTINC_STORE]](p0) :: (store (s32)) - ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p0), [[INT1:%[0-9]+]]:_(s20), [[INT2:%[0-9]+]]:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2p.add.3d), [[COPY]](p0), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20) - ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[INT]](p0) + ; CHECK-NEXT: G_AIE_OFFSET_STORE [[C1]](s32), [[COPY]](p0), [[C2]](s20) :: (store (s32)) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[AIE_POSTINC_3D_STORE]](p0) %0:_(p0) = COPY $p0 %1:_(s20) = G_CONSTANT i20 64 %2:_(s32) = G_CONSTANT i32 192 @@ -61,11 +60,10 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 + ; CHECK-NEXT: %lZero:_(s32), %7:_(p0), %8:_(s20), %9:_ = G_AIE_POSTINC_3D_LOAD [[COPY]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (load (s32)) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 8 - ; CHECK-NEXT: %lZero:_(s32), %5:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[C1]](s20) :: (load (s32)) - ; CHECK-NEXT: %lOne:_(s32) = G_LOAD %5(p0) :: (load (s32)) - ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p0), [[INT1:%[0-9]+]]:_(s20), [[INT2:%[0-9]+]]:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2p.add.3d), [[COPY]](p0), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20) - ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[INT]](p0), implicit %lZero(s32), implicit %lOne(s32) + ; CHECK-NEXT: %lOne:_(s32) = G_AIE_OFFSET_LOAD 
[[COPY]](p0), [[C1]](s20) :: (load (s32)) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit %7(p0), implicit %lZero(s32), implicit %lOne(s32) %0:_(p0) = COPY $p0 %1:_(s20) = G_CONSTANT i20 64 %2:_(s32) = G_CONSTANT i32 192 diff --git a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/reorder-Mem-Instrs.mir b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/reorder-Mem-Instrs.mir index d12dca151822..72097086bdc2 100644 --- a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/reorder-Mem-Instrs.mir +++ b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/reorder-Mem-Instrs.mir @@ -28,11 +28,10 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 - ; CHECK-NEXT: %sZero:_(s32), %4:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[C]](s20) :: (load (s32)) - ; CHECK-NEXT: %sOne:_(s32) = G_LOAD %4(p0) :: (load (s32)) - ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p0), [[INT1:%[0-9]+]]:_(s20), [[INT2:%[0-9]+]]:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2p.add.3d), [[COPY]](p0), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20) + ; CHECK-NEXT: %sZero:_(s32), %6:_(p0), %7:_(s20), %8:_ = G_AIE_POSTINC_3D_LOAD [[COPY]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (load (s32)) + ; CHECK-NEXT: %sOne:_(s32) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C]](s20) :: (load (s32)) ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = nsw G_ADD %sZero, %sOne - ; CHECK-NEXT: G_STORE [[ADD]](s32), [[INT]](p0) :: (store (s32)) + ; CHECK-NEXT: G_STORE [[ADD]](s32), %6(p0) :: (store (s32)) ; CHECK-NEXT: PseudoRET implicit $lr %0:_(p0) = COPY $p0 %1:_(s20) = G_CONSTANT i20 64 diff --git a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/user-intrinsics.mir b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/user-intrinsics.mir index dc23545aaa57..8f01580ae760 100644 --- a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/user-intrinsics.mir +++ 
b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/user-intrinsics.mir @@ -29,13 +29,11 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p1 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 8 - ; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY]], [[C]](s20) :: (load (s32)) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[AIE_POSTINC_LOAD1]](p0) :: (load (s32)) - ; CHECK-NEXT: [[AIE_POSTINC_LOAD2:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD3:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[COPY1]], [[C]](s20) :: (load (s32)) - ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[AIE_POSTINC_LOAD3]](p0) :: (load (s32)) - ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p0), [[INT1:%[0-9]+]]:_(s20), [[INT2:%[0-9]+]]:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2p.add.3d), [[COPY]](p0), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20) - ; CHECK-NEXT: [[INT3:%[0-9]+]]:_(p0), [[INT4:%[0-9]+]]:_(s20), [[INT5:%[0-9]+]]:_(s20) = G_INTRINSIC intrinsic(@llvm.aie2p.add.3d), [[COPY1]](p0), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20), [[C]](s20) - ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[AIE_POSTINC_LOAD]](s32), implicit [[LOAD]](s32), implicit [[AIE_POSTINC_LOAD2]](s32), implicit [[LOAD1]](s32), implicit [[INT]](p0), implicit [[INT3]](p0) + ; CHECK-NEXT: [[AIE_POSTINC_3D_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_3D_LOAD1:%[0-9]+]]:_(p0), [[AIE_POSTINC_3D_LOAD2:%[0-9]+]]:_(s20), [[AIE_POSTINC_3D_LOAD3:%[0-9]+]]:_ = G_AIE_POSTINC_3D_LOAD [[COPY]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (load (s32)) + ; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:_(s32) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C]](s20) :: (load (s32)) + ; CHECK-NEXT: [[AIE_POSTINC_3D_LOAD4:%[0-9]+]]:_(s32), [[AIE_POSTINC_3D_LOAD5:%[0-9]+]]:_(p0), [[AIE_POSTINC_3D_LOAD6:%[0-9]+]]:_(s20), [[AIE_POSTINC_3D_LOAD7:%[0-9]+]]:_ = G_AIE_POSTINC_3D_LOAD [[COPY1]], 
[[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (load (s32)) + ; CHECK-NEXT: [[AIE_OFFSET_LOAD1:%[0-9]+]]:_(s32) = G_AIE_OFFSET_LOAD [[COPY1]](p0), [[C]](s20) :: (load (s32)) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[AIE_POSTINC_3D_LOAD]](s32), implicit [[AIE_OFFSET_LOAD]](s32), implicit [[AIE_POSTINC_3D_LOAD4]](s32), implicit [[AIE_OFFSET_LOAD1]](s32), implicit [[AIE_POSTINC_3D_LOAD1]](p0), implicit [[AIE_POSTINC_3D_LOAD5]](p0) %0:_(p0) = COPY $p0 %1:_(p0) = COPY $p1 %2:_(s20) = G_CONSTANT i20 8 diff --git a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/prologepilog-tail-call-opt.ll b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/prologepilog-tail-call-opt.ll index 157245118c1b..6fd0e27cd8a3 100644 --- a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/prologepilog-tail-call-opt.ll +++ b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/prologepilog-tail-call-opt.ll @@ -40,8 +40,8 @@ define dso_local void @_Z5test2PPv(ptr nocapture readonly %args) local_unnamed_a ; CHECK: bb.0.entry: ; CHECK-NEXT: liveins: $p0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $p1, renamable $p0 = LDA_dms_lda_pstm_nrm_imm killed renamable $p0, 4 :: (load (p0) from %ir.args, align 4) - ; CHECK-NEXT: renamable $p0 = LDA_dms_lda_idx_imm killed renamable $p0, 0 :: (load (p0) from %ir.arrayidx1, align 4) + ; CHECK-NEXT: renamable $p1 = LDA_dms_lda_idx_imm renamable $p0, 0 :: (load (p0) from %ir.args, align 4) + ; CHECK-NEXT: renamable $p0 = LDA_dms_lda_idx_imm killed renamable $p0, 4 :: (load (p0) from %ir.arrayidx1, align 4) ; CHECK-NEXT: renamable $r1 = LDA_dms_lda_idx_imm killed renamable $p1, 0 :: (load (s32) from %ir.0) ; CHECK-NEXT: renamable $r2 = LDA_dms_lda_idx_imm killed renamable $p0, 0 :: (load (s32) from %ir.2) ; CHECK-NEXT: PseudoJ_TCO_jump_imm @_Z4funcii, csr_aie2p, implicit $r1, implicit $r2 diff --git a/llvm/test/CodeGen/AIE/aie2p/Memops.ll b/llvm/test/CodeGen/AIE/aie2p/Memops.ll index bf1305c7cc66..7920f8de0e71 100644 --- a/llvm/test/CodeGen/AIE/aie2p/Memops.ll +++ 
b/llvm/test/CodeGen/AIE/aie2p/Memops.ll @@ -20,15 +20,15 @@ define dso_local void @lowerMemcpyUsingWord() local_unnamed_addr #0 { ; CHECK-NEXT: lda r1, [p0], #4 ; CHECK-NEXT: lda r2, [p0], #4 ; CHECK-NEXT: lda r3, [p0], #4 -; CHECK-NEXT: lda r4, [p0], #4 -; CHECK-NEXT: lda r5, [p0, #0] +; CHECK-NEXT: lda r4, [p0, #0] +; CHECK-NEXT: lda r5, [p0, #4] ; CHECK-NEXT: movxm p0, ##buffer1 ; CHECK-NEXT: st r0, [p0], #4 ; CHECK-NEXT: st r1, [p0], #4; ret lr ; CHECK-NEXT: st r2, [p0], #4 // Delay Slot 5 ; CHECK-NEXT: st r3, [p0], #4 // Delay Slot 4 -; CHECK-NEXT: st r4, [p0], #4 // Delay Slot 3 -; CHECK-NEXT: st r5, [p0, #0] // Delay Slot 2 +; CHECK-NEXT: st r4, [p0, #0] // Delay Slot 3 +; CHECK-NEXT: st r5, [p0, #4] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: tail call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 4 dereferenceable(24) @buffer1, ptr noundef nonnull align 4 dereferenceable(24) @buffer2, i32 24, i1 false) @@ -73,16 +73,16 @@ define dso_local void @lowerMemcpyUsingHalfByte() local_unnamed_addr #0 { ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: nopa ; nopb ; movxm p0, ##buffer2; nops -; CHECK-NEXT: lda.s16 r0, [p0], #2; movxm p1, ##buffer1 -; CHECK-NEXT: st.s16 r0, [p1], #2 +; CHECK-NEXT: lda.s16 r0, [p0, #0]; movxm p1, ##buffer1 +; CHECK-NEXT: st.s16 r0, [p1, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: lda.s8 r0, [p0, #0] -; CHECK-NEXT: st.s8 r0, [p1, #0] +; CHECK-NEXT: lda.s8 r0, [p0, #2] +; CHECK-NEXT: st.s8 r0, [p1, #2] ; CHECK-NEXT: nop ; CHECK-NEXT: ret lr ; CHECK-NEXT: nop // Delay Slot 5 @@ -101,31 +101,37 @@ define dso_local void @lowerMemcpyUsingWordHalfByte() local_unnamed_addr #0 { ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: nopa ; nopb ; nops ; movxm p0, ##(buffer2+8); nopv -; CHECK-NEXT: lda.s16 r0, [p0, #0]; nopb ; nopx -; CHECK-NEXT: lda r1, [p0, #-4]; movxm p1, ##(buffer1+8) +; CHECK-NEXT: lda.s16 
r0, [p0, #0]; nopb ; movxm p1, ##(buffer1+8) ; CHECK-NEXT: st.s16 r0, [p1, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: mova m0, #-8 +; CHECK-NEXT: padda [p0], m0 +; CHECK-NEXT: lda r0, [p0], #4 ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: lda.s8 r0, [p0, #2] -; CHECK-NEXT: st.s8 r0, [p1, #2] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: padda [p1], m0 +; CHECK-NEXT: lda.s8 r0, [p0, #6]; st r0, [p1], #4 +; CHECK-NEXT: st.s8 r0, [p1, #6] +; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: lda r0, [p0, #-8] +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: lda r0, [p0, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: ret lr ; CHECK-NEXT: nop // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 -; CHECK-NEXT: st r1, [p1, #-4] // Delay Slot 3 -; CHECK-NEXT: st r0, [p1, #-8] // Delay Slot 2 +; CHECK-NEXT: nop // Delay Slot 3 +; CHECK-NEXT: st r0, [p1, #0] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: tail call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 4 dereferenceable(11) @buffer1, ptr noundef nonnull align 4 dereferenceable(11) @buffer2, i32 11, i1 false) @@ -159,15 +165,15 @@ define dso_local void @lowerMemcpyUsingWordVector16() local_unnamed_addr #0 { ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: movxm p0, ##buffer2 ; CHECK-NEXT: vldb.128 wh0, [p0], #16; nopx -; CHECK-NEXT: vldb.128 wh2, [p0], #16 -; CHECK-NEXT: lda r0, [p0, #0] +; CHECK-NEXT: vldb.128 wh2, [p0, #0] +; CHECK-NEXT: lda r0, [p0, #16] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: ret lr ; CHECK-NEXT: movxm p0, ##buffer1 // Delay Slot 5 ; CHECK-NEXT: vst.128 wh0, [p0], #16 // Delay Slot 4 -; CHECK-NEXT: vst.128 wh2, [p0], #16 // Delay Slot 3 -; CHECK-NEXT: st r0, [p0, #0] // Delay Slot 2 +; CHECK-NEXT: vst.128 wh2, [p0, #0] // Delay Slot 3 +; CHECK-NEXT: st r0, [p0, #16] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: tail call void 
@llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 16 dereferenceable(36) @buffer1, ptr noundef nonnull align 16 dereferenceable(36) @buffer2, i32 36, i1 false) @@ -183,15 +189,15 @@ define dso_local void @lowerMemcpyUsingWordVector32() local_unnamed_addr #0 { ; CHECK-NEXT: vldb wh0, [p0], #32 ; CHECK-NEXT: lda r0, [p0], #4 ; CHECK-NEXT: lda r1, [p0], #4 -; CHECK-NEXT: lda r2, [p0], #4 -; CHECK-NEXT: lda r3, [p0, #0] +; CHECK-NEXT: lda r2, [p0, #0] +; CHECK-NEXT: lda r3, [p0, #4] ; CHECK-NEXT: nop ; CHECK-NEXT: movxm p0, ##buffer1 ; CHECK-NEXT: vst wh0, [p0], #32; ret lr ; CHECK-NEXT: st r0, [p0], #4 // Delay Slot 5 ; CHECK-NEXT: st r1, [p0], #4 // Delay Slot 4 -; CHECK-NEXT: st r2, [p0], #4 // Delay Slot 3 -; CHECK-NEXT: st r3, [p0, #0] // Delay Slot 2 +; CHECK-NEXT: st r2, [p0, #0] // Delay Slot 3 +; CHECK-NEXT: st r3, [p0, #4] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: tail call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 32 dereferenceable(48) @buffer1, ptr noundef nonnull align 32 dereferenceable(48) @buffer2, i32 48, i1 false) @@ -207,15 +213,15 @@ define dso_local void @lowerMemcpyUsingWordVector64() local_unnamed_addr #0 { ; CHECK-NEXT: vldb x0, [p0], #64 ; CHECK-NEXT: lda r0, [p0], #4 ; CHECK-NEXT: lda r1, [p0], #4 -; CHECK-NEXT: lda r2, [p0], #4 -; CHECK-NEXT: lda r3, [p0, #0] +; CHECK-NEXT: lda r2, [p0, #0] +; CHECK-NEXT: lda r3, [p0, #4] ; CHECK-NEXT: nop ; CHECK-NEXT: movxm p0, ##buffer1 ; CHECK-NEXT: vst x0, [p0], #64; ret lr ; CHECK-NEXT: st r0, [p0], #4 // Delay Slot 5 ; CHECK-NEXT: st r1, [p0], #4 // Delay Slot 4 -; CHECK-NEXT: st r2, [p0], #4 // Delay Slot 3 -; CHECK-NEXT: st r3, [p0, #0] // Delay Slot 2 +; CHECK-NEXT: st r2, [p0, #0] // Delay Slot 3 +; CHECK-NEXT: st r3, [p0, #4] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: tail call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 64 dereferenceable(80) @buffer1, ptr noundef nonnull align 64 dereferenceable(80) @buffer2, i32 80, i1 false) @@ -263,8 +269,8 
@@ define dso_local void @lowerMemsetUsingWordVector32() local_unnamed_addr #2 { ; CHECK-NEXT: vst wl0, [p0], #32; ret lr ; CHECK-NEXT: st r0, [p0], #4 // Delay Slot 5 ; CHECK-NEXT: st r0, [p0], #4 // Delay Slot 4 -; CHECK-NEXT: st r0, [p0], #4 // Delay Slot 3 -; CHECK-NEXT: st r0, [p0, #0] // Delay Slot 2 +; CHECK-NEXT: st r0, [p0, #0] // Delay Slot 3 +; CHECK-NEXT: st r0, [p0, #4] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: tail call void @llvm.memset.p0.i32(ptr noundef nonnull align 32 dereferenceable(48) @buffer1, i8 0, i32 48, i1 false) @@ -282,8 +288,8 @@ define dso_local void @lowerMemsetUsingWordVector64() local_unnamed_addr #2 { ; CHECK-NEXT: vst x0, [p0], #64; ret lr ; CHECK-NEXT: st r0, [p0], #4 // Delay Slot 5 ; CHECK-NEXT: st r0, [p0], #4 // Delay Slot 4 -; CHECK-NEXT: st r0, [p0], #4 // Delay Slot 3 -; CHECK-NEXT: st r0, [p0, #0] // Delay Slot 2 +; CHECK-NEXT: st r0, [p0, #0] // Delay Slot 3 +; CHECK-NEXT: st r0, [p0, #4] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: tail call void @llvm.memset.p0.i32(ptr noundef nonnull align 64 dereferenceable(80) @buffer1, i8 0, i32 80, i1 false) diff --git a/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_convert.ll b/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_convert.ll index 88242ad52468..815b64974884 100644 --- a/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_convert.ll +++ b/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_convert.ll @@ -16,24 +16,26 @@ define weak_odr dso_local void @convert_bf16_to_bfp16(ptr noalias %in, ptr noali ; CHECK-LABEL: convert_bf16_to_bfp16: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda dn0, [p2, #4]; nopx -; CHECK-NEXT: lda m1, [p2, #8] +; CHECK-NEXT: lda r0, [p2, #0]; nopb ; nops ; nopx ; mov m0, #4; nopv +; CHECK-NEXT: padda [p2], m0; nopx +; CHECK-NEXT: lda dn0, [p2], #4 +; CHECK-NEXT: lda m1, [p2, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: mova r24, #0 +; CHECK-NEXT: movx r24, #0 ; CHECK-NEXT: 
mova dj0, #0; mov r26, r24 -; CHECK-NEXT: lda r0, [p2, #0]; vldb.fill.512 [p0, lf0, r24]; mov dj1, dj0 +; CHECK-NEXT: vldb.fill.512 [p0, lf0, r24]; mov dj1, dj0 ; CHECK-NEXT: movs dc1, dj0; vldb.pop.512 x0, [p0, lf0, r24]; mov dn1, dn0 ; CHECK-NEXT: vldb.pop.512.2d x2, [p0, lf0, r24, d1] ; CHECK-NEXT: nop ; CHECK-NEXT: vldb.fill.512 [p0, lf0, r24] -; CHECK-NEXT: lda m0, [p2, #12]; vldb.pop.512 x0, [p0, lf0, r24]; movxm ls, #.LBB0_1 +; CHECK-NEXT: lda m0, [p2, #4]; vldb.pop.512 x0, [p0, lf0, r24]; movxm ls, #.LBB0_1 ; CHECK-NEXT: vldb.pop.512.2d x2, [p0, lf0, r24, d1]; movxm le, #.L_LEnd0 ; CHECK-NEXT: add.nc lc, r0, #-3 ; CHECK-NEXT: nopa ; vldb.fill.512 [p0, lf0, r24]; nops ; nopxm ; nopv ; CHECK-NEXT: nopa ; vldb.pop.512 x0, [p0, lf0, r24]; nops ; nopx ; vconv.fp32.bf16 cml0, x0; nopv ; CHECK-NEXT: nopa ; vldb.pop.512.2d x2, [p0, lf0, r24, d1]; nops ; nopx ; vconv.fp32.bf16 cmh0, x2; nopv -; CHECK-NEXT: nopa ; nopb ; movs p2, p1; nopx ; mov dc0, dj0; nopv +; CHECK-NEXT: nopa ; nopb ; movs dc0, dj0; nopx ; mov p2, p1; nopv ; CHECK-NEXT: // implicit-def: $sf ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_1: // %for.body diff --git a/llvm/test/CodeGen/AIE/aie2p/fifo-loads.ll b/llvm/test/CodeGen/AIE/aie2p/fifo-loads.ll index 24a3c7b91cf1..0750cf10c5f6 100644 --- a/llvm/test/CodeGen/AIE/aie2p/fifo-loads.ll +++ b/llvm/test/CodeGen/AIE/aie2p/fifo-loads.ll @@ -17,14 +17,13 @@ define dso_local void @_Z17test_fifo_ld_fillRPDv64_DB8_R12fifo_state_t(ptr nocap ; CHECK-LABEL: _Z17test_fifo_ld_fillRPDv64_DB8_R12fifo_state_t: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p0, [p0, #0]; mov p3, p1 -; CHECK-NEXT: vlda lfl0, [p3], #128 -; CHECK-NEXT: lda r24, [p3, #0] +; CHECK-NEXT: lda p0, [p0, #0]; mov dj0, #128 +; CHECK-NEXT: lda r24, [p1, dj0]; nopx ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: movs p2, p0 ; CHECK-NEXT: nop +; CHECK-NEXT: vlda lfl0, [p1, #0]; movs p2, p0 ; CHECK-NEXT: vlda lfh0, [p1, #64] ; CHECK-NEXT: 
vldb.fill.512 [p0, lf0, r24] ; CHECK-NEXT: nop @@ -32,7 +31,7 @@ define dso_local void @_Z17test_fifo_ld_fillRPDv64_DB8_R12fifo_state_t(ptr nocap ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: ret lr -; CHECK-NEXT: st r24, [p3, #0] // Delay Slot 5 +; CHECK-NEXT: st r24, [p1, dj0] // Delay Slot 5 ; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 ; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p0, [p2, #0] // Delay Slot 2 @@ -56,14 +55,13 @@ define dso_local noundef <64 x i8> @_Z16test_fifo_ld_popRPDv64_DB8_R12fifo_state ; CHECK-LABEL: _Z16test_fifo_ld_popRPDv64_DB8_R12fifo_state_t: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p0, [p0, #0]; mov p3, p1 -; CHECK-NEXT: vlda lfl0, [p3], #128 -; CHECK-NEXT: lda r24, [p3, #0] +; CHECK-NEXT: lda p0, [p0, #0]; mov dj0, #128 +; CHECK-NEXT: lda r24, [p1, dj0]; nopx ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: movs p2, p0 ; CHECK-NEXT: nop +; CHECK-NEXT: vlda lfl0, [p1, #0]; movs p2, p0 ; CHECK-NEXT: vlda lfh0, [p1, #64] ; CHECK-NEXT: vldb.pop.512 x0, [p0, lf0, r24] ; CHECK-NEXT: nop @@ -71,7 +69,7 @@ define dso_local noundef <64 x i8> @_Z16test_fifo_ld_popRPDv64_DB8_R12fifo_state ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: ret lr -; CHECK-NEXT: st r24, [p3, #0] // Delay Slot 5 +; CHECK-NEXT: st r24, [p1, dj0] // Delay Slot 5 ; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 ; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p0, [p2, #0] // Delay Slot 2 @@ -96,14 +94,13 @@ define dso_local noundef <64 x i8> @_Z24test_fifo_ld_pop_1d_byteRPDv64_DB8_R12fi ; CHECK-LABEL: _Z24test_fifo_ld_pop_1d_byteRPDv64_DB8_R12fifo_state_ti: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p0, [p0, #0]; nopb ; nops ; nopx ; mov p3, p1; nopv -; CHECK-NEXT: vlda lfl0, [p3], #128; nopb ; nopx -; CHECK-NEXT: lda r24, [p3, #0] +; CHECK-NEXT: lda p0, [p0, #0]; mov dj0, #128 +; CHECK-NEXT: lda r24, [p1, dj0] ; CHECK-NEXT: nop ; 
CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: movs p2, p0 ; CHECK-NEXT: nop +; CHECK-NEXT: vlda lfl0, [p1, #0]; movs p2, p0 ; CHECK-NEXT: vlda lfh0, [p1, #64]; mov m0, r0 ; CHECK-NEXT: vldb.pop.512 x0, [p0, lf0, r24, m0] ; CHECK-NEXT: nop @@ -111,7 +108,7 @@ define dso_local noundef <64 x i8> @_Z24test_fifo_ld_pop_1d_byteRPDv64_DB8_R12fi ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: ret lr -; CHECK-NEXT: st r24, [p3, #0] // Delay Slot 5 +; CHECK-NEXT: st r24, [p1, dj0] // Delay Slot 5 ; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 ; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p0, [p2, #0] // Delay Slot 2 @@ -137,15 +134,14 @@ define dso_local noundef <64 x i8> @_Z24test_fifo_ld_pop_2d_byteRPDv64_DB8_R12fi ; CHECK-LABEL: _Z24test_fifo_ld_pop_2d_byteRPDv64_DB8_R12fifo_state_tiiRii: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p0, [p0, #0]; nopx -; CHECK-NEXT: lda dc0, [p2, #0]; mov p4, p1 -; CHECK-NEXT: vlda lfl0, [p4], #128 -; CHECK-NEXT: lda r24, [p4, #0] +; CHECK-NEXT: lda p0, [p0, #0]; nopb ; nopx +; CHECK-NEXT: lda dc0, [p2, #0]; mov dj1, #128 +; CHECK-NEXT: lda r24, [p1, dj1] ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: movs p3, p0 ; CHECK-NEXT: nop -; CHECK-NEXT: mov m0, r0 +; CHECK-NEXT: movs p3, p0 +; CHECK-NEXT: vlda lfl0, [p1, #0]; mov m0, r0 ; CHECK-NEXT: vlda lfh0, [p1, #64]; movs dn0, r1; mov dj0, r2 ; CHECK-NEXT: vldb.pop.512.2d x0, [p0, lf0, r24, d0] ; CHECK-NEXT: nop @@ -153,7 +149,7 @@ define dso_local noundef <64 x i8> @_Z24test_fifo_ld_pop_2d_byteRPDv64_DB8_R12fi ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: st dc0, [p2, #0]; ret lr -; CHECK-NEXT: st r24, [p4, #0] // Delay Slot 5 +; CHECK-NEXT: st r24, [p1, dj1] // Delay Slot 5 ; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 ; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p0, [p3, #0] // Delay Slot 2 @@ -186,16 +182,15 @@ define dso_local noundef <64 x i8> @_Z24test_fifo_ld_pop_3d_byteRPDv64_DB8_R12fi ; 
CHECK-LABEL: _Z24test_fifo_ld_pop_3d_byteRPDv64_DB8_R12fifo_state_tiiRiiiS5_i: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p0, [p0, #0]; nopxm +; CHECK-NEXT: lda p0, [p0, #0]; nopb ; nopxm ; CHECK-NEXT: lda dc0, [p2, #0] -; CHECK-NEXT: lda dc4, [p3, #0]; mov p5, p1 -; CHECK-NEXT: vlda lfl0, [p5], #128 -; CHECK-NEXT: lda r24, [p5, #0] +; CHECK-NEXT: lda dc4, [p3, #0]; mov dj1, #128 +; CHECK-NEXT: lda r24, [p1, dj1] ; CHECK-NEXT: nop -; CHECK-NEXT: movs p4, p0 ; CHECK-NEXT: nop +; CHECK-NEXT: movs p4, p0 ; CHECK-NEXT: mov m0, r0 -; CHECK-NEXT: movs dn0, r1; mov dj0, r2 +; CHECK-NEXT: vlda lfl0, [p1, #0]; movs dn0, r1; mov dj0, r2 ; CHECK-NEXT: vlda lfh0, [p1, #64]; movs dn4, r3; mov dj4, r4 ; CHECK-NEXT: vldb.pop.512.3d x0, [p0, lf0, r24, d0] ; CHECK-NEXT: nop @@ -203,7 +198,7 @@ define dso_local noundef <64 x i8> @_Z24test_fifo_ld_pop_3d_byteRPDv64_DB8_R12fi ; CHECK-NEXT: nop ; CHECK-NEXT: st dc0, [p2, #0] ; CHECK-NEXT: st dc4, [p3, #0]; ret lr -; CHECK-NEXT: st r24, [p5, #0] // Delay Slot 5 +; CHECK-NEXT: st r24, [p1, dj1] // Delay Slot 5 ; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 ; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p0, [p4, #0] // Delay Slot 2 @@ -244,14 +239,13 @@ define dso_local %struct.v64bfp16ebs8 @_Z16test_fifo_ld_popRP22v64bfp16ebs8_unal ; CHECK-LABEL: _Z16test_fifo_ld_popRP22v64bfp16ebs8_unalignedR12fifo_state_t: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p0, [p0, #0]; mov p3, p1 -; CHECK-NEXT: vlda lfl0, [p3], #128 -; CHECK-NEXT: lda r24, [p3, #0] +; CHECK-NEXT: lda p0, [p0, #0]; mov dj0, #128 +; CHECK-NEXT: lda r24, [p1, dj0]; nopx ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: movs p2, p0 ; CHECK-NEXT: nop +; CHECK-NEXT: vlda lfl0, [p1, #0]; movs p2, p0 ; CHECK-NEXT: vlda lfh0, [p1, #64] ; CHECK-NEXT: vldb.pop.576 ex0, [p0, lf0, r24] ; CHECK-NEXT: nop @@ -259,7 +253,7 @@ define dso_local %struct.v64bfp16ebs8 
@_Z16test_fifo_ld_popRP22v64bfp16ebs8_unal ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: ret lr -; CHECK-NEXT: st r24, [p3, #0] // Delay Slot 5 +; CHECK-NEXT: st r24, [p1, dj0] // Delay Slot 5 ; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 ; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p0, [p2, #0] // Delay Slot 2 @@ -287,14 +281,13 @@ define dso_local %struct.v64bfp16ebs8 @_Z24test_fifo_ld_pop_1d_byteRP22v64bfp16e ; CHECK-LABEL: _Z24test_fifo_ld_pop_1d_byteRP22v64bfp16ebs8_unalignedR12fifo_state_ti: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p0, [p0, #0]; nopb ; nops ; nopx ; mov p3, p1; nopv -; CHECK-NEXT: vlda lfl0, [p3], #128; nopb ; nopx -; CHECK-NEXT: lda r24, [p3, #0] +; CHECK-NEXT: lda p0, [p0, #0]; mov dj0, #128 +; CHECK-NEXT: lda r24, [p1, dj0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: movs p2, p0 ; CHECK-NEXT: nop +; CHECK-NEXT: vlda lfl0, [p1, #0]; movs p2, p0 ; CHECK-NEXT: vlda lfh0, [p1, #64]; mov m0, r0 ; CHECK-NEXT: vldb.pop.576 ex0, [p0, lf0, r24, m0] ; CHECK-NEXT: nop @@ -302,7 +295,7 @@ define dso_local %struct.v64bfp16ebs8 @_Z24test_fifo_ld_pop_1d_byteRP22v64bfp16e ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: ret lr -; CHECK-NEXT: st r24, [p3, #0] // Delay Slot 5 +; CHECK-NEXT: st r24, [p1, dj0] // Delay Slot 5 ; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 ; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p0, [p2, #0] // Delay Slot 2 @@ -331,15 +324,14 @@ define dso_local %struct.v64bfp16ebs8 @_Z24test_fifo_ld_pop_2d_byteRP22v64bfp16e ; CHECK-LABEL: _Z24test_fifo_ld_pop_2d_byteRP22v64bfp16ebs8_unalignedR12fifo_state_tiiRii: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p0, [p0, #0]; nopx -; CHECK-NEXT: lda dc0, [p2, #0]; mov p4, p1 -; CHECK-NEXT: vlda lfl0, [p4], #128 -; CHECK-NEXT: lda r24, [p4, #0] +; CHECK-NEXT: lda p0, [p0, #0]; nopb ; nopx +; CHECK-NEXT: lda dc0, [p2, #0]; mov dj1, #128 +; CHECK-NEXT: lda 
r24, [p1, dj1] ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: movs p3, p0 ; CHECK-NEXT: nop -; CHECK-NEXT: mov m0, r0 +; CHECK-NEXT: movs p3, p0 +; CHECK-NEXT: vlda lfl0, [p1, #0]; mov m0, r0 ; CHECK-NEXT: vlda lfh0, [p1, #64]; movs dn0, r1; mov dj0, r2 ; CHECK-NEXT: vldb.pop.576.2d ex0, [p0, lf0, r24, d0] ; CHECK-NEXT: nop @@ -347,7 +339,7 @@ define dso_local %struct.v64bfp16ebs8 @_Z24test_fifo_ld_pop_2d_byteRP22v64bfp16e ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: st dc0, [p2, #0]; ret lr -; CHECK-NEXT: st r24, [p4, #0] // Delay Slot 5 +; CHECK-NEXT: st r24, [p1, dj1] // Delay Slot 5 ; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 ; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p0, [p3, #0] // Delay Slot 2 @@ -383,16 +375,15 @@ define dso_local %struct.v64bfp16ebs8 @_Z24test_fifo_ld_pop_3d_byteRP22v64bfp16e ; CHECK-LABEL: _Z24test_fifo_ld_pop_3d_byteRP22v64bfp16ebs8_unalignedR12fifo_state_tiiRiiiS4_i: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p0, [p0, #0]; nopxm +; CHECK-NEXT: lda p0, [p0, #0]; nopb ; nopxm ; CHECK-NEXT: lda dc0, [p2, #0] -; CHECK-NEXT: lda dc4, [p3, #0]; mov p5, p1 -; CHECK-NEXT: vlda lfl0, [p5], #128 -; CHECK-NEXT: lda r24, [p5, #0] +; CHECK-NEXT: lda dc4, [p3, #0]; mov dj1, #128 +; CHECK-NEXT: lda r24, [p1, dj1] ; CHECK-NEXT: nop -; CHECK-NEXT: movs p4, p0 ; CHECK-NEXT: nop +; CHECK-NEXT: movs p4, p0 ; CHECK-NEXT: mov m0, r0 -; CHECK-NEXT: movs dn0, r1; mov dj0, r2 +; CHECK-NEXT: vlda lfl0, [p1, #0]; movs dn0, r1; mov dj0, r2 ; CHECK-NEXT: vlda lfh0, [p1, #64]; movs dn4, r3; mov dj4, r4 ; CHECK-NEXT: vldb.pop.576.3d ex0, [p0, lf0, r24, d0] ; CHECK-NEXT: nop @@ -400,7 +391,7 @@ define dso_local %struct.v64bfp16ebs8 @_Z24test_fifo_ld_pop_3d_byteRP22v64bfp16e ; CHECK-NEXT: nop ; CHECK-NEXT: st dc0, [p2, #0] ; CHECK-NEXT: st dc4, [p3, #0]; ret lr -; CHECK-NEXT: st r24, [p5, #0] // Delay Slot 5 +; CHECK-NEXT: st r24, [p1, dj1] // Delay Slot 5 ; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 
4 ; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p0, [p4, #0] // Delay Slot 2 @@ -444,14 +435,13 @@ define dso_local %struct.v64bfp16ebs16 @_Z16test_fifo_ld_popRP23v64bfp16ebs16_un ; CHECK-LABEL: _Z16test_fifo_ld_popRP23v64bfp16ebs16_unalignedR12fifo_state_t: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p0, [p0, #0]; mov p3, p1 -; CHECK-NEXT: vlda lfl0, [p3], #128 -; CHECK-NEXT: lda r24, [p3, #0] +; CHECK-NEXT: lda p0, [p0, #0]; mov dj0, #128 +; CHECK-NEXT: lda r24, [p1, dj0]; nopx ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: movs p2, p0 ; CHECK-NEXT: nop +; CHECK-NEXT: vlda lfl0, [p1, #0]; movs p2, p0 ; CHECK-NEXT: vlda lfh0, [p1, #64] ; CHECK-NEXT: vldb.pop.544 ex0, [p0, lf0, r24] ; CHECK-NEXT: nop @@ -459,7 +449,7 @@ define dso_local %struct.v64bfp16ebs16 @_Z16test_fifo_ld_popRP23v64bfp16ebs16_un ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: ret lr -; CHECK-NEXT: st r24, [p3, #0] // Delay Slot 5 +; CHECK-NEXT: st r24, [p1, dj0] // Delay Slot 5 ; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 ; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p0, [p2, #0] // Delay Slot 2 @@ -487,14 +477,13 @@ define dso_local %struct.v64bfp16ebs16 @_Z24test_fifo_ld_pop_1d_byteRP23v64bfp16 ; CHECK-LABEL: _Z24test_fifo_ld_pop_1d_byteRP23v64bfp16ebs16_unalignedR12fifo_state_ti: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p0, [p0, #0]; nopb ; nops ; nopx ; mov p3, p1; nopv -; CHECK-NEXT: vlda lfl0, [p3], #128; nopb ; nopx -; CHECK-NEXT: lda r24, [p3, #0] +; CHECK-NEXT: lda p0, [p0, #0]; mov dj0, #128 +; CHECK-NEXT: lda r24, [p1, dj0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: movs p2, p0 ; CHECK-NEXT: nop +; CHECK-NEXT: vlda lfl0, [p1, #0]; movs p2, p0 ; CHECK-NEXT: vlda lfh0, [p1, #64]; mov m0, r0 ; CHECK-NEXT: vldb.pop.544 ex0, [p0, lf0, r24, m0] ; CHECK-NEXT: nop @@ -502,7 +491,7 @@ define dso_local %struct.v64bfp16ebs16 
@_Z24test_fifo_ld_pop_1d_byteRP23v64bfp16 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: ret lr -; CHECK-NEXT: st r24, [p3, #0] // Delay Slot 5 +; CHECK-NEXT: st r24, [p1, dj0] // Delay Slot 5 ; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 ; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p0, [p2, #0] // Delay Slot 2 @@ -531,15 +520,14 @@ define dso_local %struct.v64bfp16ebs16 @_Z24test_fifo_ld_pop_2d_byteRP23v64bfp16 ; CHECK-LABEL: _Z24test_fifo_ld_pop_2d_byteRP23v64bfp16ebs16_unalignedR12fifo_state_tiiRii: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p0, [p0, #0]; nopx -; CHECK-NEXT: lda dc0, [p2, #0]; mov p4, p1 -; CHECK-NEXT: vlda lfl0, [p4], #128 -; CHECK-NEXT: lda r24, [p4, #0] +; CHECK-NEXT: lda p0, [p0, #0]; nopb ; nopx +; CHECK-NEXT: lda dc0, [p2, #0]; mov dj1, #128 +; CHECK-NEXT: lda r24, [p1, dj1] ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: movs p3, p0 ; CHECK-NEXT: nop -; CHECK-NEXT: mov m0, r0 +; CHECK-NEXT: movs p3, p0 +; CHECK-NEXT: vlda lfl0, [p1, #0]; mov m0, r0 ; CHECK-NEXT: vlda lfh0, [p1, #64]; movs dn0, r1; mov dj0, r2 ; CHECK-NEXT: vldb.pop.544.2d ex0, [p0, lf0, r24, d0] ; CHECK-NEXT: nop @@ -547,7 +535,7 @@ define dso_local %struct.v64bfp16ebs16 @_Z24test_fifo_ld_pop_2d_byteRP23v64bfp16 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: st dc0, [p2, #0]; ret lr -; CHECK-NEXT: st r24, [p4, #0] // Delay Slot 5 +; CHECK-NEXT: st r24, [p1, dj1] // Delay Slot 5 ; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 ; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p0, [p3, #0] // Delay Slot 2 @@ -583,16 +571,15 @@ define dso_local %struct.v64bfp16ebs16 @_Z24test_fifo_ld_pop_3d_byteRP23v64bfp16 ; CHECK-LABEL: _Z24test_fifo_ld_pop_3d_byteRP23v64bfp16ebs16_unalignedR12fifo_state_tiiRiiiS4_i: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p0, [p0, #0]; nopxm +; CHECK-NEXT: lda p0, [p0, #0]; nopb ; nopxm ; CHECK-NEXT: lda dc0, [p2, #0] -; CHECK-NEXT: lda dc4, 
[p3, #0]; mov p5, p1 -; CHECK-NEXT: vlda lfl0, [p5], #128 -; CHECK-NEXT: lda r24, [p5, #0] +; CHECK-NEXT: lda dc4, [p3, #0]; mov dj1, #128 +; CHECK-NEXT: lda r24, [p1, dj1] ; CHECK-NEXT: nop -; CHECK-NEXT: movs p4, p0 ; CHECK-NEXT: nop +; CHECK-NEXT: movs p4, p0 ; CHECK-NEXT: mov m0, r0 -; CHECK-NEXT: movs dn0, r1; mov dj0, r2 +; CHECK-NEXT: vlda lfl0, [p1, #0]; movs dn0, r1; mov dj0, r2 ; CHECK-NEXT: vlda lfh0, [p1, #64]; movs dn4, r3; mov dj4, r4 ; CHECK-NEXT: vldb.pop.544.3d ex0, [p0, lf0, r24, d0] ; CHECK-NEXT: nop @@ -600,7 +587,7 @@ define dso_local %struct.v64bfp16ebs16 @_Z24test_fifo_ld_pop_3d_byteRP23v64bfp16 ; CHECK-NEXT: nop ; CHECK-NEXT: st dc0, [p2, #0] ; CHECK-NEXT: st dc4, [p3, #0]; ret lr -; CHECK-NEXT: st r24, [p5, #0] // Delay Slot 5 +; CHECK-NEXT: st r24, [p1, dj1] // Delay Slot 5 ; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 ; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p0, [p4, #0] // Delay Slot 2 @@ -643,14 +630,13 @@ define dso_local noundef <64 x i8> @_Z17test_fifo_ld_popxRPDv64_hR12fifo_state_t ; CHECK-LABEL: _Z17test_fifo_ld_popxRPDv64_hR12fifo_state_t: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p0, [p0, #0]; nopx ; mov p3, p1 -; CHECK-NEXT: vlda lfl0, [p3], #128 -; CHECK-NEXT: lda r24, [p3, #0] +; CHECK-NEXT: lda p0, [p0, #0]; nopb ; nopx ; mov dj0, #128 +; CHECK-NEXT: lda r24, [p1, dj0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: movs p2, p0 -; CHECK-NEXT: vlda lfh0, [p1, #64] +; CHECK-NEXT: vlda lfl0, [p1, #0] +; CHECK-NEXT: vlda lfh0, [p1, #64]; movs p2, p0 ; CHECK-NEXT: vlda lfe, [p1, #192]; movxm r30, #2015 ; CHECK-NEXT: vldb.popx.512 x0, [p0, lf0, r24] ; CHECK-NEXT: nop @@ -658,7 +644,7 @@ define dso_local noundef <64 x i8> @_Z17test_fifo_ld_popxRPDv64_hR12fifo_state_t ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st r24, [p3, #0]; ret lr +; CHECK-NEXT: st r24, [p1, dj0]; ret lr ; CHECK-NEXT: vst lfe, [p1, #192] // Delay Slot 5 ; 
CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 ; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 @@ -688,14 +674,13 @@ define dso_local void @_Z18test_fifo_ld_fillxRP22v64bfp16ebs8_unalignedR12fifo_s ; CHECK-LABEL: _Z18test_fifo_ld_fillxRP22v64bfp16ebs8_unalignedR12fifo_state_tii: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p0, [p0, #0]; nopx ; mov p3, p1 -; CHECK-NEXT: vlda lfl0, [p3], #128 -; CHECK-NEXT: lda r24, [p3, #0] -; CHECK-NEXT: nop +; CHECK-NEXT: lda p0, [p0, #0]; nopx ; mov dj0, #128 +; CHECK-NEXT: lda r24, [p1, dj0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: mova r2, #6; movs p2, p0 -; CHECK-NEXT: vlda lfh0, [p1, #64]; lshl r0, r0, r2 +; CHECK-NEXT: vlda lfl0, [p1, #0] +; CHECK-NEXT: mova r2, #6 +; CHECK-NEXT: vlda lfh0, [p1, #64]; movs p2, p0; lshl r0, r0, r2 ; CHECK-NEXT: vlda lfe, [p1, #192]; or r30, r0, r1 ; CHECK-NEXT: vldb.fillx.512 [p0, lf0, r24] ; CHECK-NEXT: nop @@ -703,7 +688,7 @@ define dso_local void @_Z18test_fifo_ld_fillxRP22v64bfp16ebs8_unalignedR12fifo_s ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st r24, [p3, #0]; ret lr +; CHECK-NEXT: st r24, [p1, dj0]; ret lr ; CHECK-NEXT: vst lfe, [p1, #192] // Delay Slot 5 ; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 ; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 @@ -772,14 +757,13 @@ define dso_local void @_Z17test_fifo_ld_fillRrP23v128bfp16ebs8_unalignedR12fifo_ ; CHECK-LABEL: _Z17test_fifo_ld_fillRrP23v128bfp16ebs8_unalignedR12fifo_state_t: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p0, [p0, #0]; mov p3, p1 -; CHECK-NEXT: vlda lfl0, [p3], #128 -; CHECK-NEXT: lda r24, [p3, #0] +; CHECK-NEXT: lda p0, [p0, #0]; mov dj0, #128 +; CHECK-NEXT: lda r24, [p1, dj0]; nopx ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: movs p2, p0 ; CHECK-NEXT: nop +; CHECK-NEXT: vlda lfl0, [p1, #0]; movs p2, p0 ; CHECK-NEXT: vlda lfh0, [p1, #64] ; CHECK-NEXT: vldb.fill.512 [p0, lf0, r24] ; CHECK-NEXT: 
nop @@ -787,7 +771,7 @@ define dso_local void @_Z17test_fifo_ld_fillRrP23v128bfp16ebs8_unalignedR12fifo_ ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: ret lr -; CHECK-NEXT: st r24, [p3, #0] // Delay Slot 5 +; CHECK-NEXT: st r24, [p1, dj0] // Delay Slot 5 ; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 ; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p0, [p2, #0] // Delay Slot 2 @@ -811,14 +795,13 @@ define dso_local %struct.v128bfp16ebs8 @_Z16test_fifo_ld_popRrP23v128bfp16ebs8_u ; CHECK-LABEL: _Z16test_fifo_ld_popRrP23v128bfp16ebs8_unalignedR12fifo_state_t: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p0, [p0, #0]; nopb ; nops ; nopx ; mov p3, p1; nopv -; CHECK-NEXT: vlda lfl0, [p3], #128; nopb ; nopx -; CHECK-NEXT: lda r24, [p3, #0] +; CHECK-NEXT: lda p0, [p0, #0]; mov dj0, #128 +; CHECK-NEXT: lda r24, [p1, dj0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: movs p2, p0 ; CHECK-NEXT: nop +; CHECK-NEXT: vlda lfl0, [p1, #0]; movs p2, p0 ; CHECK-NEXT: vlda lfh0, [p1, #64] ; CHECK-NEXT: vldb.pop.576 ex0, [p0, lf0, r24] ; CHECK-NEXT: nop @@ -826,11 +809,11 @@ define dso_local %struct.v128bfp16ebs8 @_Z16test_fifo_ld_popRrP23v128bfp16ebs8_u ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st r24, [p3, #0] +; CHECK-NEXT: st r24, [p1, dj0] ; CHECK-NEXT: vst lfl0, [p1, #0] ; CHECK-NEXT: vst lfh0, [p1, #64] ; CHECK-NEXT: st p0, [p2, #0] -; CHECK-NEXT: lda r24, [p3, #0] +; CHECK-NEXT: lda r24, [p1, dj0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -843,7 +826,7 @@ define dso_local %struct.v128bfp16ebs8 @_Z16test_fifo_ld_popRrP23v128bfp16ebs8_u ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: ret lr -; CHECK-NEXT: st r24, [p3, #0] // Delay Slot 5 +; CHECK-NEXT: st r24, [p1, dj0] // Delay Slot 5 ; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 ; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p0, [p2, #0] // Delay Slot 2 @@ -884,14 +867,13 @@ define 
dso_local %struct.v128bfp16ebs8 @_Z24test_fifo_ld_pop_1d_byteRrP23v128bfp ; CHECK-LABEL: _Z24test_fifo_ld_pop_1d_byteRrP23v128bfp16ebs8_unalignedR12fifo_state_ti: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p0, [p0, #0]; nopb ; nops ; nopx ; mov p3, p1; nopv -; CHECK-NEXT: vlda lfl0, [p3], #128; nopx -; CHECK-NEXT: lda r24, [p3, #0] +; CHECK-NEXT: lda p0, [p0, #0]; nopb ; nops ; nopx ; mov dj0, #128; nopv +; CHECK-NEXT: lda r24, [p1, dj0]; nopb ; nopx ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: movs p2, p0 ; CHECK-NEXT: nop +; CHECK-NEXT: vlda lfl0, [p1, #0]; movs p2, p0 ; CHECK-NEXT: vlda lfh0, [p1, #64] ; CHECK-NEXT: vldb.pop.576 ex0, [p0, lf0, r24] ; CHECK-NEXT: nop @@ -899,11 +881,11 @@ define dso_local %struct.v128bfp16ebs8 @_Z24test_fifo_ld_pop_1d_byteRrP23v128bfp ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st r24, [p3, #0] +; CHECK-NEXT: st r24, [p1, dj0] ; CHECK-NEXT: vst lfl0, [p1, #0] ; CHECK-NEXT: vst lfh0, [p1, #64] ; CHECK-NEXT: st p0, [p2, #0] -; CHECK-NEXT: lda r24, [p3, #0] +; CHECK-NEXT: lda r24, [p1, dj0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -916,7 +898,7 @@ define dso_local %struct.v128bfp16ebs8 @_Z24test_fifo_ld_pop_1d_byteRrP23v128bfp ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: ret lr -; CHECK-NEXT: st r24, [p3, #0] // Delay Slot 5 +; CHECK-NEXT: st r24, [p1, dj0] // Delay Slot 5 ; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 ; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p0, [p2, #0] // Delay Slot 2 @@ -958,14 +940,13 @@ define dso_local %struct.v128bfp16ebs8 @_Z24test_fifo_ld_pop_2d_byteRrP23v128bfp ; CHECK-LABEL: _Z24test_fifo_ld_pop_2d_byteRrP23v128bfp16ebs8_unalignedR12fifo_state_tiiRii: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p0, [p0, #0]; mov p4, p1 -; CHECK-NEXT: vlda lfl0, [p4], #128 -; CHECK-NEXT: lda r24, [p4, #0] +; CHECK-NEXT: lda p0, [p0, #0]; mov dj1, #128 +; CHECK-NEXT: 
lda r24, [p1, dj1]; nopx ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: movs p3, p0 ; CHECK-NEXT: nop +; CHECK-NEXT: vlda lfl0, [p1, #0]; movs p3, p0 ; CHECK-NEXT: vlda lfh0, [p1, #64] ; CHECK-NEXT: vldb.pop.576 ex0, [p0, lf0, r24] ; CHECK-NEXT: nop @@ -973,11 +954,11 @@ define dso_local %struct.v128bfp16ebs8 @_Z24test_fifo_ld_pop_2d_byteRrP23v128bfp ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st r24, [p4, #0] +; CHECK-NEXT: st r24, [p1, dj1] ; CHECK-NEXT: vst lfl0, [p1, #0] ; CHECK-NEXT: vst lfh0, [p1, #64] ; CHECK-NEXT: st p0, [p3, #0] -; CHECK-NEXT: lda r24, [p4, #0] +; CHECK-NEXT: lda r24, [p1, dj1] ; CHECK-NEXT: lda dc0, [p2, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -991,7 +972,7 @@ define dso_local %struct.v128bfp16ebs8 @_Z24test_fifo_ld_pop_2d_byteRrP23v128bfp ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: st dc0, [p2, #0]; ret lr -; CHECK-NEXT: st r24, [p4, #0] // Delay Slot 5 +; CHECK-NEXT: st r24, [p1, dj1] // Delay Slot 5 ; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 ; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p0, [p3, #0] // Delay Slot 2 @@ -1040,14 +1021,13 @@ define dso_local %struct.v128bfp16ebs8 @_Z24test_fifo_ld_pop_3d_byteRrP23v128bfp ; CHECK-LABEL: _Z24test_fifo_ld_pop_3d_byteRrP23v128bfp16ebs8_unalignedR12fifo_state_tiiRiiiS5_i: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p0, [p0, #0]; nopx ; mov p5, p1 -; CHECK-NEXT: vlda lfl0, [p5], #128 -; CHECK-NEXT: lda r24, [p5, #0] +; CHECK-NEXT: lda p0, [p0, #0]; nopb ; nopx ; mov dj1, #128 +; CHECK-NEXT: lda r24, [p1, dj1] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: movs p4, p0 ; CHECK-NEXT: nop +; CHECK-NEXT: vlda lfl0, [p1, #0]; movs p4, p0 ; CHECK-NEXT: vlda lfh0, [p1, #64] ; CHECK-NEXT: vldb.pop.576 ex0, [p0, lf0, r24] ; CHECK-NEXT: nop @@ -1055,11 +1035,11 @@ define dso_local %struct.v128bfp16ebs8 @_Z24test_fifo_ld_pop_3d_byteRrP23v128bfp ; CHECK-NEXT: nop ; 
CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st r24, [p5, #0] +; CHECK-NEXT: st r24, [p1, dj1] ; CHECK-NEXT: vst lfl0, [p1, #0] ; CHECK-NEXT: vst lfh0, [p1, #64] ; CHECK-NEXT: st p0, [p4, #0] -; CHECK-NEXT: lda r24, [p5, #0] +; CHECK-NEXT: lda r24, [p1, dj1] ; CHECK-NEXT: lda dc0, [p2, #0] ; CHECK-NEXT: lda dc4, [p3, #0] ; CHECK-NEXT: nop @@ -1074,7 +1054,7 @@ define dso_local %struct.v128bfp16ebs8 @_Z24test_fifo_ld_pop_3d_byteRrP23v128bfp ; CHECK-NEXT: nop ; CHECK-NEXT: st dc0, [p2, #0] ; CHECK-NEXT: st dc4, [p3, #0]; ret lr -; CHECK-NEXT: st r24, [p5, #0] // Delay Slot 5 +; CHECK-NEXT: st r24, [p1, dj1] // Delay Slot 5 ; CHECK-NEXT: vst lfl0, [p1, #0] // Delay Slot 4 ; CHECK-NEXT: vst lfh0, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p0, [p4, #0] // Delay Slot 2 diff --git a/llvm/test/CodeGen/AIE/aie2p/ldst-fifo-stores.ll b/llvm/test/CodeGen/AIE/aie2p/ldst-fifo-stores.ll index 0dad1049c609..e0b4620cd23e 100644 --- a/llvm/test/CodeGen/AIE/aie2p/ldst-fifo-stores.ll +++ b/llvm/test/CodeGen/AIE/aie2p/ldst-fifo-stores.ll @@ -48,9 +48,9 @@ define dso_local void @_Z17test_fifo_st_pushRPDv64_DB8_S0_R12fifo_state_t(ptr no ; CHECK-LABEL: _Z17test_fifo_st_pushRPDv64_DB8_S0_R12fifo_state_t: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p2, [p0, #0]; nopb ; nops ; nopx ; mov p4, p1; nopv -; CHECK-NEXT: vlda sfl, [p4], #128 -; CHECK-NEXT: lda r26, [p4, #0] +; CHECK-NEXT: vlda sfl, [p1, #0]; nopb ; nopxm ; nops +; CHECK-NEXT: lda p2, [p0, #0]; mov dj0, #128 +; CHECK-NEXT: lda r26, [p1, dj0] ; CHECK-NEXT: vlda sfh, [p1, #64] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -59,7 +59,7 @@ define dso_local void @_Z17test_fifo_st_pushRPDv64_DB8_S0_R12fifo_state_t(ptr no ; CHECK-NEXT: nop ; CHECK-NEXT: vst.push.512 x0, [p2, sf, r26] ; CHECK-NEXT: ret lr -; CHECK-NEXT: st r26, [p4, #0] // Delay Slot 5 +; CHECK-NEXT: st r26, [p1, dj0] // Delay Slot 5 ; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4 ; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st 
p2, [p0, #0] // Delay Slot 2 @@ -85,9 +85,9 @@ define dso_local void @_Z18test_fifo_st_flushRPDv64_DB8_R12fifo_state_t(ptr noca ; CHECK-LABEL: _Z18test_fifo_st_flushRPDv64_DB8_R12fifo_state_t: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p2, [p0, #0]; nopb ; nops ; nopx ; mov p4, p1; nopv -; CHECK-NEXT: vlda sfl, [p4], #128 -; CHECK-NEXT: lda r26, [p4, #0] +; CHECK-NEXT: vlda sfl, [p1, #0]; nopb ; nopxm ; nops +; CHECK-NEXT: lda p2, [p0, #0]; mov dj0, #128 +; CHECK-NEXT: lda r26, [p1, dj0] ; CHECK-NEXT: vlda sfh, [p1, #64] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -96,7 +96,7 @@ define dso_local void @_Z18test_fifo_st_flushRPDv64_DB8_R12fifo_state_t(ptr noca ; CHECK-NEXT: nop ; CHECK-NEXT: vst.flush.512 [p2, sf, r26] ; CHECK-NEXT: ret lr -; CHECK-NEXT: st r26, [p4, #0] // Delay Slot 5 +; CHECK-NEXT: st r26, [p1, dj0] // Delay Slot 5 ; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4 ; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2 @@ -121,9 +121,9 @@ define dso_local void @_Z26test_fifo_st_flush_1d_byteRPDv64_DB8_R12fifo_state_ti ; CHECK-LABEL: _Z26test_fifo_st_flush_1d_byteRPDv64_DB8_R12fifo_state_ti: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p2, [p0, #0]; nopb ; nopx ; mov p4, p1; nops -; CHECK-NEXT: vlda sfl, [p4], #128 -; CHECK-NEXT: lda r26, [p4, #0] +; CHECK-NEXT: vlda sfl, [p1, #0]; nopb ; nopxm +; CHECK-NEXT: lda p2, [p0, #0]; mov dj0, #128 +; CHECK-NEXT: lda r26, [p1, dj0] ; CHECK-NEXT: vlda sfh, [p1, #64] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -132,7 +132,7 @@ define dso_local void @_Z26test_fifo_st_flush_1d_byteRPDv64_DB8_R12fifo_state_ti ; CHECK-NEXT: mov m0, r0 ; CHECK-NEXT: vst.flush.512 [p2, sf, r26, m0] ; CHECK-NEXT: ret lr -; CHECK-NEXT: st r26, [p4, #0] // Delay Slot 5 +; CHECK-NEXT: st r26, [p1, dj0] // Delay Slot 5 ; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4 ; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p2, [p0, #0] // Delay 
Slot 2 @@ -158,22 +158,21 @@ define dso_local void @_Z26test_fifo_st_flush_2d_byteRPDv64_DB8_R12fifo_state_ti ; CHECK-LABEL: _Z26test_fifo_st_flush_2d_byteRPDv64_DB8_R12fifo_state_tiiRii: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p5, [p0, #0] -; CHECK-NEXT: lda dc0, [p2, #0]; mov p3, p1 -; CHECK-NEXT: vlda sfl, [p3], #128 -; CHECK-NEXT: lda r26, [p3, #0] -; CHECK-NEXT: nop +; CHECK-NEXT: lda p4, [p0, #0]; nopx +; CHECK-NEXT: lda dc0, [p2, #0]; mov dj1, #128 +; CHECK-NEXT: lda r26, [p1, dj1] ; CHECK-NEXT: nop +; CHECK-NEXT: vlda sfl, [p1, #0] ; CHECK-NEXT: vlda sfh, [p1, #64] -; CHECK-NEXT: mov p4, p2 +; CHECK-NEXT: movs p3, p2 ; CHECK-NEXT: movs m0, r0; mov dn0, r1 -; CHECK-NEXT: movs dj0, r2; mov p2, p5 +; CHECK-NEXT: movs dj0, r2; mov p2, p4 ; CHECK-NEXT: vst.flush.512.conv.2d [p2, sf, r26, d0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st dc0, [p4, #0]; ret lr -; CHECK-NEXT: st r26, [p3, #0] // Delay Slot 5 +; CHECK-NEXT: st dc0, [p3, #0]; ret lr +; CHECK-NEXT: st r26, [p1, dj1] // Delay Slot 5 ; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4 ; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2 @@ -206,22 +205,22 @@ define dso_local void @_Z26test_fifo_st_flush_3d_byteRPDv64_DB8_R12fifo_state_ti ; CHECK-LABEL: _Z26test_fifo_st_flush_3d_byteRPDv64_DB8_R12fifo_state_tiiRiiiS5_i: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda r5, [p0, #0]; nopxm +; CHECK-NEXT: vlda sfl, [p1, #0]; nopxm +; CHECK-NEXT: lda p5, [p0, #0] ; CHECK-NEXT: lda dc0, [p2, #0] -; CHECK-NEXT: lda dc4, [p3, #0]; mov p4, p1 -; CHECK-NEXT: vlda sfl, [p4], #128 -; CHECK-NEXT: lda r26, [p4, #0] +; CHECK-NEXT: lda dc4, [p3, #0]; mov dj1, #128 +; CHECK-NEXT: lda r26, [p1, dj1] ; CHECK-NEXT: vlda sfh, [p1, #64] ; CHECK-NEXT: nop -; CHECK-NEXT: mov p5, p2 +; CHECK-NEXT: movs p4, p2 ; CHECK-NEXT: movs m0, r0; mov dn0, r1 ; CHECK-NEXT: movs dj0, r2; mov dn4, r3 -; CHECK-NEXT: movs 
dj4, r4; mov p2, r5 +; CHECK-NEXT: movs dj4, r4; mov p2, p5 ; CHECK-NEXT: vst.flush.512.3d [p2, sf, r26, d0] ; CHECK-NEXT: nop -; CHECK-NEXT: st dc0, [p5, #0] +; CHECK-NEXT: st dc0, [p4, #0] ; CHECK-NEXT: st dc4, [p3, #0]; ret lr -; CHECK-NEXT: st r26, [p4, #0] // Delay Slot 5 +; CHECK-NEXT: st r26, [p1, dj1] // Delay Slot 5 ; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4 ; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2 @@ -261,11 +260,10 @@ define dso_local void @_Z23test_fifo_st_flush_convRPDv64_DB8_R12fifo_state_t(ptr ; CHECK-LABEL: _Z23test_fifo_st_flush_convRPDv64_DB8_R12fifo_state_t: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p2, [p0, #0]; nopb ; nopx ; mov p4, p1 -; CHECK-NEXT: vlda sfl, [p4], #128 -; CHECK-NEXT: lda r26, [p4, #0] -; CHECK-NEXT: nop +; CHECK-NEXT: lda p2, [p0, #0]; nopb ; nopx ; mov dj0, #128; nops +; CHECK-NEXT: lda r26, [p1, dj0] ; CHECK-NEXT: nop +; CHECK-NEXT: vlda sfl, [p1, #0] ; CHECK-NEXT: vlda sfh, [p1, #64] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -274,7 +272,7 @@ define dso_local void @_Z23test_fifo_st_flush_convRPDv64_DB8_R12fifo_state_t(ptr ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: ret lr -; CHECK-NEXT: st r26, [p4, #0] // Delay Slot 5 +; CHECK-NEXT: st r26, [p1, dj0] // Delay Slot 5 ; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4 ; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2 @@ -299,11 +297,10 @@ define dso_local void @_Z31test_fifo_st_flush_conv_1d_byteRPDv64_DB8_R12fifo_sta ; CHECK-LABEL: _Z31test_fifo_st_flush_conv_1d_byteRPDv64_DB8_R12fifo_state_ti: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p2, [p0, #0]; nopx ; mov p4, p1 -; CHECK-NEXT: vlda sfl, [p4], #128 -; CHECK-NEXT: lda r26, [p4, #0] -; CHECK-NEXT: nop +; CHECK-NEXT: lda p2, [p0, #0]; nopb ; nopx ; mov dj0, #128 +; CHECK-NEXT: lda r26, [p1, dj0] ; CHECK-NEXT: nop +; CHECK-NEXT: vlda sfl, [p1, #0] ; CHECK-NEXT: 
vlda sfh, [p1, #64] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -312,7 +309,7 @@ define dso_local void @_Z31test_fifo_st_flush_conv_1d_byteRPDv64_DB8_R12fifo_sta ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: ret lr -; CHECK-NEXT: st r26, [p4, #0] // Delay Slot 5 +; CHECK-NEXT: st r26, [p1, dj0] // Delay Slot 5 ; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4 ; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2 @@ -338,20 +335,20 @@ define dso_local void @_Z31test_fifo_st_flush_conv_2d_byteRPDv64_DB8_R12fifo_sta ; CHECK-LABEL: _Z31test_fifo_st_flush_conv_2d_byteRPDv64_DB8_R12fifo_state_tiiRii: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p5, [p0, #0]; nopb ; nopx -; CHECK-NEXT: lda dc0, [p2, #0]; mov p3, p1 -; CHECK-NEXT: vlda sfl, [p3], #128 -; CHECK-NEXT: lda r26, [p3, #0] +; CHECK-NEXT: vlda sfl, [p1, #0]; nopb ; nopx +; CHECK-NEXT: lda p4, [p0, #0] +; CHECK-NEXT: lda dc0, [p2, #0]; mov dj1, #128 +; CHECK-NEXT: lda r26, [p1, dj1] ; CHECK-NEXT: vlda sfh, [p1, #64] ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: mov p4, p2 +; CHECK-NEXT: movs p3, p2 ; CHECK-NEXT: movs m0, r0; mov dn0, r1 -; CHECK-NEXT: movs dj0, r2; mov p2, p5 +; CHECK-NEXT: movs dj0, r2; mov p2, p4 ; CHECK-NEXT: vst.flush.512.2d [p2, sf, r26, d0] ; CHECK-NEXT: nop -; CHECK-NEXT: st dc0, [p4, #0]; ret lr -; CHECK-NEXT: st r26, [p3, #0] // Delay Slot 5 +; CHECK-NEXT: st dc0, [p3, #0]; ret lr +; CHECK-NEXT: st r26, [p1, dj1] // Delay Slot 5 ; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4 ; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2 @@ -384,24 +381,23 @@ define dso_local void @_Z31test_fifo_st_flush_conv_3d_byteRPDv64_DB8_R12fifo_sta ; CHECK-LABEL: _Z31test_fifo_st_flush_conv_3d_byteRPDv64_DB8_R12fifo_state_tiiRiiiS5_i: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda r5, [p0, #0]; nopx +; CHECK-NEXT: lda p5, [p0, #0]; nopb ; nopx ; CHECK-NEXT: lda dc0, [p2, 
#0] -; CHECK-NEXT: lda dc4, [p3, #0]; mov p4, p1 -; CHECK-NEXT: vlda sfl, [p4], #128 -; CHECK-NEXT: lda r26, [p4, #0] -; CHECK-NEXT: nop +; CHECK-NEXT: lda dc4, [p3, #0]; mov dj1, #128 +; CHECK-NEXT: lda r26, [p1, dj1] ; CHECK-NEXT: nop -; CHECK-NEXT: vlda sfh, [p1, #64]; mov p5, p2 +; CHECK-NEXT: vlda sfl, [p1, #0] +; CHECK-NEXT: vlda sfh, [p1, #64]; movs p4, p2 ; CHECK-NEXT: movs m0, r0; mov dn0, r1 ; CHECK-NEXT: movs dj0, r2; mov dn4, r3 -; CHECK-NEXT: movs dj4, r4; mov p2, r5 +; CHECK-NEXT: movs dj4, r4; mov p2, p5 ; CHECK-NEXT: vst.flush.512.conv.3d [p2, sf, r26, d0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st dc0, [p5, #0] +; CHECK-NEXT: st dc0, [p4, #0] ; CHECK-NEXT: st dc4, [p3, #0]; ret lr -; CHECK-NEXT: st r26, [p4, #0] // Delay Slot 5 +; CHECK-NEXT: st r26, [p1, dj1] // Delay Slot 5 ; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4 ; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2 @@ -474,9 +470,9 @@ define dso_local void @test_fifo_st_push_v64bfp16ebs16(ptr nocapture nonnull ali ; CHECK-LABEL: test_fifo_st_push_v64bfp16ebs16: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p2, [p0, #0]; nopb ; nops ; nopx ; mov p4, p1; nopv -; CHECK-NEXT: vlda sfl, [p4], #128 -; CHECK-NEXT: lda r26, [p4, #0] +; CHECK-NEXT: vlda sfl, [p1, #0]; nopb ; nopxm ; nops +; CHECK-NEXT: lda p2, [p0, #0]; mov dj0, #128 +; CHECK-NEXT: lda r26, [p1, dj0] ; CHECK-NEXT: vlda sfh, [p1, #64] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -485,7 +481,7 @@ define dso_local void @test_fifo_st_push_v64bfp16ebs16(ptr nocapture nonnull ali ; CHECK-NEXT: nop ; CHECK-NEXT: vst.push.544 ex0, [p2, sf, r26] ; CHECK-NEXT: ret lr -; CHECK-NEXT: st r26, [p4, #0] // Delay Slot 5 +; CHECK-NEXT: st r26, [p1, dj0] // Delay Slot 5 ; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4 ; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2 @@ -544,9 +540,9 @@ define dso_local void 
@test_fifo_st_push_v64bfp16ebs8(ptr nocapture nonnull alig ; CHECK-LABEL: test_fifo_st_push_v64bfp16ebs8: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p2, [p0, #0]; nopb ; nops ; nopx ; mov p4, p1; nopv -; CHECK-NEXT: vlda sfl, [p4], #128 -; CHECK-NEXT: lda r26, [p4, #0] +; CHECK-NEXT: vlda sfl, [p1, #0]; nopb ; nopxm ; nops +; CHECK-NEXT: lda p2, [p0, #0]; mov dj0, #128 +; CHECK-NEXT: lda r26, [p1, dj0] ; CHECK-NEXT: vlda sfh, [p1, #64] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -555,7 +551,7 @@ define dso_local void @test_fifo_st_push_v64bfp16ebs8(ptr nocapture nonnull alig ; CHECK-NEXT: nop ; CHECK-NEXT: vst.push.576 ex0, [p2, sf, r26] ; CHECK-NEXT: ret lr -; CHECK-NEXT: st r26, [p4, #0] // Delay Slot 5 +; CHECK-NEXT: st r26, [p1, dj0] // Delay Slot 5 ; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4 ; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2 @@ -582,9 +578,9 @@ define dso_local void @_Z17test_fifo_st_pushRrP23v128bfp16ebs8_unaligned13v128bf ; CHECK-LABEL: _Z17test_fifo_st_pushRrP23v128bfp16ebs8_unaligned13v128bfp16ebs8R12fifo_state_t: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p2, [p0, #0]; nopb ; nops ; nopx ; mov p3, p1; nopv -; CHECK-NEXT: vlda sfl, [p3], #128; nopb ; nopx -; CHECK-NEXT: lda r26, [p3, #0] +; CHECK-NEXT: vlda sfl, [p1, #0]; nopb ; nops ; nopxm ; nopv +; CHECK-NEXT: lda p2, [p0, #0]; mov dj0, #128 +; CHECK-NEXT: lda r26, [p1, dj0]; nopx ; CHECK-NEXT: vlda sfh, [p1, #64] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -593,12 +589,12 @@ define dso_local void @_Z17test_fifo_st_pushRrP23v128bfp16ebs8_unaligned13v128bf ; CHECK-NEXT: nop ; CHECK-NEXT: vst.push.576 ex0, [p2, sf, r26] ; CHECK-NEXT: nop -; CHECK-NEXT: st r26, [p3, #0] +; CHECK-NEXT: st r26, [p1, dj0] ; CHECK-NEXT: vst sfl, [p1, #0] ; CHECK-NEXT: vst sfh, [p1, #64] ; CHECK-NEXT: st p2, [p0, #0] ; CHECK-NEXT: vlda sfl, [p1, #0] -; CHECK-NEXT: lda r26, [p3, #0] +; CHECK-NEXT: lda r26, [p1, dj0] 
; CHECK-NEXT: vlda sfh, [p1, #64] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -607,7 +603,7 @@ define dso_local void @_Z17test_fifo_st_pushRrP23v128bfp16ebs8_unaligned13v128bf ; CHECK-NEXT: nop ; CHECK-NEXT: vst.push.576 ex1, [p2, sf, r26] ; CHECK-NEXT: ret lr -; CHECK-NEXT: st r26, [p3, #0] // Delay Slot 5 +; CHECK-NEXT: st r26, [p1, dj0] // Delay Slot 5 ; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4 ; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2 @@ -644,9 +640,9 @@ define dso_local void @_Z18test_fifo_st_flushRrP23v128bfp16ebs8_unalignedR12fifo ; CHECK-LABEL: _Z18test_fifo_st_flushRrP23v128bfp16ebs8_unalignedR12fifo_state_t: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p2, [p0, #0]; nopb ; nops ; nopx ; mov p3, p1; nopv -; CHECK-NEXT: vlda sfl, [p3], #128; nopb ; nopx -; CHECK-NEXT: lda r26, [p3, #0] +; CHECK-NEXT: vlda sfl, [p1, #0]; nopb ; nops ; nopxm ; nopv +; CHECK-NEXT: lda p2, [p0, #0]; mov dj0, #128 +; CHECK-NEXT: lda r26, [p1, dj0]; nopx ; CHECK-NEXT: vlda sfh, [p1, #64] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -655,12 +651,12 @@ define dso_local void @_Z18test_fifo_st_flushRrP23v128bfp16ebs8_unalignedR12fifo ; CHECK-NEXT: nop ; CHECK-NEXT: vst.flush.512 [p2, sf, r26] ; CHECK-NEXT: nop -; CHECK-NEXT: st r26, [p3, #0] +; CHECK-NEXT: st r26, [p1, dj0] ; CHECK-NEXT: vst sfl, [p1, #0] ; CHECK-NEXT: vst sfh, [p1, #64] ; CHECK-NEXT: st p2, [p0, #0] ; CHECK-NEXT: vlda sfl, [p1, #0] -; CHECK-NEXT: lda r26, [p3, #0] +; CHECK-NEXT: lda r26, [p1, dj0] ; CHECK-NEXT: vlda sfh, [p1, #64] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -669,7 +665,7 @@ define dso_local void @_Z18test_fifo_st_flushRrP23v128bfp16ebs8_unalignedR12fifo ; CHECK-NEXT: nop ; CHECK-NEXT: vst.flush.512 [p2, sf, r26] ; CHECK-NEXT: ret lr -; CHECK-NEXT: st r26, [p3, #0] // Delay Slot 5 +; CHECK-NEXT: st r26, [p1, dj0] // Delay Slot 5 ; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4 ; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3 ; 
CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2 @@ -702,9 +698,9 @@ define dso_local void @_Z26test_fifo_st_flush_1d_byteRrP23v128bfp16ebs8_unaligne ; CHECK-LABEL: _Z26test_fifo_st_flush_1d_byteRrP23v128bfp16ebs8_unalignedR12fifo_state_ti: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: lda p2, [p0, #0]; nopb ; nops ; nopx ; mov p3, p1; nopv -; CHECK-NEXT: vlda sfl, [p3], #128; nopx -; CHECK-NEXT: lda r26, [p3, #0] +; CHECK-NEXT: vlda sfl, [p1, #0]; nopb ; nops ; nopxm ; nopv +; CHECK-NEXT: lda p2, [p0, #0]; mov dj0, #128 +; CHECK-NEXT: lda r26, [p1, dj0] ; CHECK-NEXT: vlda sfh, [p1, #64] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -713,12 +709,12 @@ define dso_local void @_Z26test_fifo_st_flush_1d_byteRrP23v128bfp16ebs8_unaligne ; CHECK-NEXT: nop ; CHECK-NEXT: vst.flush.512 [p2, sf, r26] ; CHECK-NEXT: nop -; CHECK-NEXT: st r26, [p3, #0] +; CHECK-NEXT: st r26, [p1, dj0] ; CHECK-NEXT: vst sfl, [p1, #0] ; CHECK-NEXT: vst sfh, [p1, #64] ; CHECK-NEXT: st p2, [p0, #0] ; CHECK-NEXT: vlda sfl, [p1, #0] -; CHECK-NEXT: lda r26, [p3, #0] +; CHECK-NEXT: lda r26, [p1, dj0] ; CHECK-NEXT: vlda sfh, [p1, #64] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -727,7 +723,7 @@ define dso_local void @_Z26test_fifo_st_flush_1d_byteRrP23v128bfp16ebs8_unaligne ; CHECK-NEXT: mov m0, r0 ; CHECK-NEXT: vst.flush.512 [p2, sf, r26, m0] ; CHECK-NEXT: ret lr -; CHECK-NEXT: st r26, [p3, #0] // Delay Slot 5 +; CHECK-NEXT: st r26, [p1, dj0] // Delay Slot 5 ; CHECK-NEXT: vst sfl, [p1, #0] // Delay Slot 4 ; CHECK-NEXT: vst sfh, [p1, #64] // Delay Slot 3 ; CHECK-NEXT: st p2, [p0, #0] // Delay Slot 2 diff --git a/llvm/test/CodeGen/AIE/aie2p/load-store-unaligned.ll b/llvm/test/CodeGen/AIE/aie2p/load-store-unaligned.ll index 51bd16ec3b22..4bae794dd7af 100644 --- a/llvm/test/CodeGen/AIE/aie2p/load-store-unaligned.ll +++ b/llvm/test/CodeGen/AIE/aie2p/load-store-unaligned.ll @@ -16,70 +16,68 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-LABEL: 
test_load_store_unaligned: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: mova m0, #-600; nopx -; CHECK-NEXT: paddxm [sp], #640 -; CHECK-NEXT: mov p1, sp +; CHECK-NEXT: mova m0, #-560; nopb ; nopxm ; nops +; CHECK-NEXT: paddxm [sp], #576 ; CHECK-NEXT: mov p0, sp ; CHECK-NEXT: mov p2, sp -; CHECK-NEXT: mov p4, sp -; CHECK-NEXT: mov p3, sp -; CHECK-NEXT: mov p5, sp -; CHECK-NEXT: st p6, [sp, #-636] // 4-byte Folded Spill +; CHECK-NEXT: st p6, [sp, #-572] // 4-byte Folded Spill ; CHECK-NEXT: mov p6, sp -; CHECK-NEXT: st r8, [sp, #-604] // 4-byte Folded Spill -; CHECK-NEXT: padda [p1], m0 -; CHECK-NEXT: mova m0, #-584 +; CHECK-NEXT: mov p5, sp +; CHECK-NEXT: st p7, [sp, #-576] // 4-byte Folded Spill +; CHECK-NEXT: mov p7, sp +; CHECK-NEXT: st r8, [sp, #-564] // 4-byte Folded Spill +; CHECK-NEXT: st r9, [sp, #-568] // 4-byte Folded Spill ; CHECK-NEXT: padda [p0], m0 -; CHECK-NEXT: mova m0, #-568 -; CHECK-NEXT: mov r16, p0 -; CHECK-NEXT: padda [p2], m0 ; CHECK-NEXT: mova m0, #-544 +; CHECK-NEXT: mov r17, p0 ; CHECK-NEXT: mov p0, sp -; CHECK-NEXT: padda [p4], m0 -; CHECK-NEXT: mova m0, #-480 -; CHECK-NEXT: padda [p0], #-512 -; CHECK-NEXT: mov r8, p0 +; CHECK-NEXT: padda [p0], m0 +; CHECK-NEXT: mova m0, #-528 +; CHECK-NEXT: mov r16, p0 ; CHECK-NEXT: mov p0, sp +; CHECK-NEXT: padda [p2], m0 +; CHECK-NEXT: mova m0, #-480 ; CHECK-NEXT: padda [p0], m0 -; CHECK-NEXT: mova m0, #-304 -; CHECK-NEXT: mov r25, p0 +; CHECK-NEXT: mova m0, #-416 +; CHECK-NEXT: mov r30, p0 ; CHECK-NEXT: mov p0, sp -; CHECK-NEXT: padda [p0], #-448 -; CHECK-NEXT: mov r24, p0 +; CHECK-NEXT: padda [p0], m0 +; CHECK-NEXT: mova m0, #-352 +; CHECK-NEXT: mov r9, p0 ; CHECK-NEXT: mov p0, sp -; CHECK-NEXT: padda [p0], #-320 +; CHECK-NEXT: padda [p6], m0 +; CHECK-NEXT: mova m0, #-288 +; CHECK-NEXT: padda [p0], m0 +; CHECK-NEXT: mova m0, #-272 ; CHECK-NEXT: mov r27, p0 ; CHECK-NEXT: mov p0, sp ; CHECK-NEXT: padda [p0], m0 -; CHECK-NEXT: mova m0, #-288 +; CHECK-NEXT: mova m0, #-240 ; 
CHECK-NEXT: mov r26, p0 -; CHECK-NEXT: padda [p3], m0 -; CHECK-NEXT: mova m0, #-272 ; CHECK-NEXT: mov p0, sp ; CHECK-NEXT: padda [p5], m0 -; CHECK-NEXT: mova m0, #-240 -; CHECK-NEXT: padda [p0], m0 ; CHECK-NEXT: mova m0, #-208 -; CHECK-NEXT: mov r29, p0 -; CHECK-NEXT: mov p0, sp ; CHECK-NEXT: padda [p0], m0 ; CHECK-NEXT: mova m0, #-176 -; CHECK-NEXT: st p0, [sp, #-44] // 4-byte Folded Spill +; CHECK-NEXT: mov r25, p0 ; CHECK-NEXT: mov p0, sp -; CHECK-NEXT: padda [p6], m0 -; CHECK-NEXT: mova m0, #-112 ; CHECK-NEXT: padda [p0], m0 -; CHECK-NEXT: st p0, [sp, #-48] // 4-byte Folded Spill -; CHECK-NEXT: mov p0, p1 -; CHECK-NEXT: mov p1, p0 -; CHECK-NEXT: st.s16 r0, [p1], #2 +; CHECK-NEXT: mova m0, #-144 +; CHECK-NEXT: mov r24, p0 +; CHECK-NEXT: mov p0, sp +; CHECK-NEXT: padda [p7], m0 +; CHECK-NEXT: mova m0, #-80 +; CHECK-NEXT: padda [p0], m0 +; CHECK-NEXT: mov r8, p0 +; CHECK-NEXT: mov p0, r17 +; CHECK-NEXT: st.s16 r0, [p0, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: vextract.32 r0, x0, #0, vaddsign1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st.s16 r1, [p1, #0] +; CHECK-NEXT: st.s16 r1, [p0, #2] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -116,39 +114,36 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: nop ; CHECK-NEXT: st.s16 r6, [p0, #12] ; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop ; CHECK-NEXT: vextract.32 r6, x0, #6, vaddsign1 +; CHECK-NEXT: nop ; CHECK-NEXT: vmov q0, wl2 +; CHECK-NEXT: st.s16 r7, [p0, #14] ; CHECK-NEXT: vextract.32 r7, x0, #7, vaddsign1 ; CHECK-NEXT: vmov wl0, q0 ; CHECK-NEXT: mov r28, p0 -; CHECK-NEXT: st.s16 r7, [p0, #14] -; CHECK-NEXT: mov r31, p1 -; CHECK-NEXT: mov p1, r16 ; CHECK-NEXT: vextract.32 r0, x0, #0, vaddsign1 ; CHECK-NEXT: vextract.32 r1, x0, #1, vaddsign1 -; CHECK-NEXT: mov p0, p1 -; CHECK-NEXT: vextract.32 r2, x0, #2, vaddsign1 -; CHECK-NEXT: st r0, [p0], #4 -; CHECK-NEXT: vextract.32 r3, x0, #3, vaddsign1 -; CHECK-NEXT: 
st r2, [p1, #8] -; CHECK-NEXT: st r1, [p0, #0] -; CHECK-NEXT: mov r30, p0 -; CHECK-NEXT: mov p0, p2 -; CHECK-NEXT: st r3, [p1, #12] -; CHECK-NEXT: st.s8 r0, [p0], #1 +; CHECK-NEXT: mov p0, r16 +; CHECK-NEXT: st r1, [p0, #4] +; CHECK-NEXT: st r0, [p0, #0] +; CHECK-NEXT: st.s8 r0, [p2, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: vextract.16 r0, x4, #0, vaddsign1 +; CHECK-NEXT: vextract.32 r2, x0, #2, vaddsign1 ; CHECK-NEXT: nop -; CHECK-NEXT: nop -; CHECK-NEXT: st.s8 r1, [p0, #0] +; CHECK-NEXT: st r2, [p0, #8] +; CHECK-NEXT: st.s8 r1, [p2, #1] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: vextract.16 r1, x4, #1, vaddsign1 +; CHECK-NEXT: vextract.32 r3, x0, #3, vaddsign1 ; CHECK-NEXT: nop -; CHECK-NEXT: nop +; CHECK-NEXT: st r3, [p0, #12] ; CHECK-NEXT: st.s8 r2, [p2, #2] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -197,15 +192,6 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: nop ; CHECK-NEXT: vextract.16 r16, x4, #8, vaddsign1 ; CHECK-NEXT: nop -; CHECK-NEXT: nop -; CHECK-NEXT: st r9, [sp, #-608] // 4-byte Folded Spill -; CHECK-NEXT: st r10, [sp, #-612] // 4-byte Folded Spill -; CHECK-NEXT: st p7, [sp, #-640] // 4-byte Folded Spill -; CHECK-NEXT: st r11, [sp, #-616] // 4-byte Folded Spill -; CHECK-NEXT: st r12, [sp, #-620] // 4-byte Folded Spill -; CHECK-NEXT: st r13, [sp, #-624] // 4-byte Folded Spill -; CHECK-NEXT: st r14, [sp, #-628] // 4-byte Folded Spill -; CHECK-NEXT: st r15, [sp, #-632] // 4-byte Folded Spill ; CHECK-NEXT: mova dj0, #9 ; CHECK-NEXT: st.s8 r17, [p2, dj0] ; CHECK-NEXT: nop @@ -254,251 +240,241 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: nop ; CHECK-NEXT: vextract.16 r23, x4, #15, vaddsign1 ; CHECK-NEXT: nop -; CHECK-NEXT: mov r12, p0 -; CHECK-NEXT: mov p0, p4 -; CHECK-NEXT: st.s16 r0, [p0], #2 +; CHECK-NEXT: mov p3, sp +; CHECK-NEXT: padda [p3], #-512 +; CHECK-NEXT: st.s16 r0, [p3, #0] ; CHECK-NEXT: 
nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: vextract.16 r0, x6, #0, vaddsign1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st.s16 r1, [p0, #0] +; CHECK-NEXT: st.s16 r1, [p3, #2] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: vextract.16 r1, x6, #1, vaddsign1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st.s16 r2, [p4, #4] +; CHECK-NEXT: st.s16 r2, [p3, #4] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: vextract.16 r2, x6, #2, vaddsign1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st.s16 r3, [p4, #6] +; CHECK-NEXT: st.s16 r3, [p3, #6] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: vextract.16 r3, x6, #3, vaddsign1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st.s16 r4, [p4, #8] +; CHECK-NEXT: st.s16 r4, [p3, #8] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: vextract.16 r4, x6, #4, vaddsign1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st.s16 r5, [p4, #10] +; CHECK-NEXT: st.s16 r5, [p3, #10] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: vextract.16 r5, x6, #5, vaddsign1 ; CHECK-NEXT: nop -; CHECK-NEXT: mov r9, p1 -; CHECK-NEXT: st.s16 r6, [p4, #12] -; CHECK-NEXT: vextract.16 r6, x6, #6, vaddsign1 ; CHECK-NEXT: nop -; CHECK-NEXT: mov r13, p0 -; CHECK-NEXT: mov p0, r8 +; CHECK-NEXT: st.s16 r6, [p3, #12] +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: mov r29, p0 +; CHECK-NEXT: vextract.16 r6, x6, #6, vaddsign1 ; CHECK-NEXT: vextract.32 r0, x8, #0, vaddsign1 -; CHECK-NEXT: mov p1, p0 -; CHECK-NEXT: st r0, [p1], #4 -; CHECK-NEXT: st.s16 r7, [p4, #14] -; CHECK-NEXT: mova dj4, #32 -; CHECK-NEXT: mova dj5, #36 -; CHECK-NEXT: mova dj6, #40 +; CHECK-NEXT: mov p0, r30 +; CHECK-NEXT: st r0, [p0, #0] +; CHECK-NEXT: st.s16 r7, [p3, #14] +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop ; CHECK-NEXT: vextract.16 r7, x6, #7, vaddsign1 ; CHECK-NEXT: vextract.32 r1, x8, #1, vaddsign1 ; CHECK-NEXT: mova dj0, #16 -; CHECK-NEXT: 
st r1, [p1, #0] -; CHECK-NEXT: st.s16 r16, [p4, dj0] -; CHECK-NEXT: mova dj7, #44 -; CHECK-NEXT: vmov x2, bmll0 -; CHECK-NEXT: mov p7, sp +; CHECK-NEXT: st r1, [p0, #4] +; CHECK-NEXT: st.s16 r16, [p3, dj0] +; CHECK-NEXT: nop +; CHECK-NEXT: mova dj4, #32 +; CHECK-NEXT: mova dj5, #36 ; CHECK-NEXT: vextract.16 r16, x6, #8, vaddsign1 -; CHECK-NEXT: vmov bmll0, x2 +; CHECK-NEXT: mova dj6, #40 ; CHECK-NEXT: mova dj0, #18 -; CHECK-NEXT: st.s16 r17, [p4, dj0] -; CHECK-NEXT: padda [p7], #-384 -; CHECK-NEXT: vmov x0, bmll0 +; CHECK-NEXT: st.s16 r17, [p3, dj0] +; CHECK-NEXT: vmov x2, bmll0 +; CHECK-NEXT: mov p1, sp ; CHECK-NEXT: vextract.16 r17, x6, #9, vaddsign1 ; CHECK-NEXT: vextract.32 r2, x8, #2, vaddsign1 ; CHECK-NEXT: vextract.32 r3, x8, #3, vaddsign1 ; CHECK-NEXT: mova dj0, #20 ; CHECK-NEXT: st r2, [p0, #8] ; CHECK-NEXT: st r3, [p0, #12] -; CHECK-NEXT: st.s16 r18, [p4, dj0] +; CHECK-NEXT: st.s16 r18, [p3, dj0] ; CHECK-NEXT: vmov bmll0, x2 +; CHECK-NEXT: padda [p1], #-448 +; CHECK-NEXT: vmov x0, bmll0 ; CHECK-NEXT: vextract.16 r18, x6, #10, vaddsign1 -; CHECK-NEXT: vextract.16 r19, x6, #11, vaddsign1 -; CHECK-NEXT: mova dj2, #60 -; CHECK-NEXT: mov r8, p7 +; CHECK-NEXT: vextract.64 r1:r0, x0, #0, vaddsign1 ; CHECK-NEXT: mova dj0, #22 -; CHECK-NEXT: st.s16 r19, [p4, dj0] -; CHECK-NEXT: vextract.16 r20, x6, #12, vaddsign1 -; CHECK-NEXT: mova dj1, #48 -; CHECK-NEXT: mov r10, p0 +; CHECK-NEXT: st r1, [p1, #4] +; CHECK-NEXT: st.s16 r19, [p3, dj0] +; CHECK-NEXT: mova dj7, #44 +; CHECK-NEXT: vmov bmll0, x2 +; CHECK-NEXT: vextract.16 r19, x6, #11, vaddsign1 ; CHECK-NEXT: vextract.32 r4, x8, #4, vaddsign1 ; CHECK-NEXT: vextract.32 r5, x8, #5, vaddsign1 ; CHECK-NEXT: mova dj0, #24 ; CHECK-NEXT: st r4, [p0, #16] ; CHECK-NEXT: st r5, [p0, #20] -; CHECK-NEXT: st.s16 r20, [p4, dj0] -; CHECK-NEXT: vextract.16 r21, x6, #13, vaddsign1 -; CHECK-NEXT: vextract.16 r22, x6, #14, vaddsign1 -; CHECK-NEXT: vextract.16 r23, x6, #15, vaddsign1 -; CHECK-NEXT: mova dj3, #52 -; CHECK-NEXT: mov r11, 
p1 -; CHECK-NEXT: mova dj0, #26 -; CHECK-NEXT: st.s16 r21, [p4, dj0] -; CHECK-NEXT: vextract.64 r1:r0, x0, #0, vaddsign1 +; CHECK-NEXT: st.s16 r20, [p3, dj0] +; CHECK-NEXT: mov r30, p0 +; CHECK-NEXT: mova dj2, #60 +; CHECK-NEXT: vextract.16 r20, x6, #12, vaddsign1 ; CHECK-NEXT: vmov x0, bmll0 -; CHECK-NEXT: vmov bmll0, x2 +; CHECK-NEXT: vextract.64 r3:r2, x0, #1, vaddsign1 +; CHECK-NEXT: mova dj0, #26 +; CHECK-NEXT: st r2, [p1, #8] +; CHECK-NEXT: st r3, [p1, #12] +; CHECK-NEXT: st.s16 r21, [p3, dj0] +; CHECK-NEXT: mova dj1, #48 +; CHECK-NEXT: mova dj3, #52 +; CHECK-NEXT: vextract.16 r21, x6, #13, vaddsign1 ; CHECK-NEXT: vextract.32 r6, x8, #6, vaddsign1 ; CHECK-NEXT: vextract.32 r7, x8, #7, vaddsign1 ; CHECK-NEXT: mova dj0, #28 ; CHECK-NEXT: st r6, [p0, #24] ; CHECK-NEXT: st r7, [p0, #28] -; CHECK-NEXT: st.s16 r22, [p4, dj0] -; CHECK-NEXT: vextract.64 r3:r2, x0, #1, vaddsign1 +; CHECK-NEXT: st.s16 r22, [p3, dj0] +; CHECK-NEXT: vmov bmll0, x2 +; CHECK-NEXT: vextract.16 r22, x6, #14, vaddsign1 ; CHECK-NEXT: vmov x0, bmll0 ; CHECK-NEXT: vmov bmll0, x2 ; CHECK-NEXT: vextract.64 r5:r4, x0, #2, vaddsign1 -; CHECK-NEXT: vmov x0, bmll0 ; CHECK-NEXT: mova dj0, #30 -; CHECK-NEXT: st.s16 r23, [p4, dj0] -; CHECK-NEXT: mov p0, r25 +; CHECK-NEXT: st r4, [p1, #16] +; CHECK-NEXT: st r5, [p1, #20] +; CHECK-NEXT: st.s16 r23, [p3, dj0] +; CHECK-NEXT: vextract.16 r23, x6, #15, vaddsign1 +; CHECK-NEXT: vmov x0, bmll0 +; CHECK-NEXT: mov p0, p1 ; CHECK-NEXT: vextract.64 r7:r6, x0, #3, vaddsign1 ; CHECK-NEXT: vmov x0, bmll1 -; CHECK-NEXT: mov p1, p0 -; CHECK-NEXT: mov r15, p0 ; CHECK-NEXT: mova dj0, #36 -; CHECK-NEXT: st r1, [p0, #4] -; CHECK-NEXT: st r3, [p0, #12] -; CHECK-NEXT: st r4, [p0, #16] -; CHECK-NEXT: st r5, [p0, #20] -; CHECK-NEXT: st r6, [p0, #24] -; CHECK-NEXT: st r7, [p0, #28] -; CHECK-NEXT: mov p0, r24 -; CHECK-NEXT: st r0, [p1], #8 -; CHECK-NEXT: vextract.64 r7:r6, x0, #0, vaddsign1 +; CHECK-NEXT: st r0, [p0], #12 +; CHECK-NEXT: vextract.64 r5:r4, x0, #0, vaddsign1 +; 
CHECK-NEXT: vmov x0, bmll1 +; CHECK-NEXT: mov r31, p0 +; CHECK-NEXT: mov p0, r9 +; CHECK-NEXT: st r6, [p1, #24] +; CHECK-NEXT: st r7, [p1, #28] +; CHECK-NEXT: vextract.64 r7:r6, x0, #1, vaddsign1 ; CHECK-NEXT: vmov x0, bmll1 -; CHECK-NEXT: mov r14, p1 -; CHECK-NEXT: st r2, [p1, #0] -; CHECK-NEXT: mov p1, p0 -; CHECK-NEXT: vextract.64 r17:r16, x0, #1, vaddsign1 +; CHECK-NEXT: mov r9, p0 +; CHECK-NEXT: vextract.64 r17:r16, x0, #2, vaddsign1 ; CHECK-NEXT: vmov x0, bmll1 -; CHECK-NEXT: vextract.64 r19:r18, x0, #2, vaddsign1 +; CHECK-NEXT: st r4, [p0, #0] +; CHECK-NEXT: st r5, [p0, #4] +; CHECK-NEXT: vextract.32 r4, x10, #4, vaddsign1 +; CHECK-NEXT: vextract.32 r5, x10, #5, vaddsign1 +; CHECK-NEXT: vextract.64 r19:r18, x0, #3, vaddsign1 ; CHECK-NEXT: vmov x0, bmll1 -; CHECK-NEXT: st r6, [p1], #8 +; CHECK-NEXT: st r6, [p0, #8] +; CHECK-NEXT: st r7, [p0, #12] ; CHECK-NEXT: vextract.32 r6, x10, #6, vaddsign1 ; CHECK-NEXT: vextract.32 r7, x10, #7, vaddsign1 -; CHECK-NEXT: st r7, [p0, #4] -; CHECK-NEXT: mov r24, p1 -; CHECK-NEXT: st r16, [p1, #0] -; CHECK-NEXT: mov p1, p0 -; CHECK-NEXT: vextract.64 r21:r20, x0, #3, vaddsign1 +; CHECK-NEXT: vextract.64 r21:r20, x0, #4, vaddsign1 ; CHECK-NEXT: vmov x0, bmll1 -; CHECK-NEXT: st r17, [p0, #12] +; CHECK-NEXT: st r16, [p0, #16] +; CHECK-NEXT: st r17, [p0, #20] ; CHECK-NEXT: vextract.32 r16, x10, #8, vaddsign1 ; CHECK-NEXT: vextract.32 r17, x10, #9, vaddsign1 -; CHECK-NEXT: vextract.64 r23:r22, x0, #4, vaddsign1 +; CHECK-NEXT: st r4, [p6, #16] +; CHECK-NEXT: st r5, [p6, #20] +; CHECK-NEXT: vextract.64 r23:r22, x0, #5, vaddsign1 ; CHECK-NEXT: vmov x0, bmll1 -; CHECK-NEXT: st r18, [p0, #16] -; CHECK-NEXT: st r19, [p0, #20] +; CHECK-NEXT: st r18, [p0, #24] +; CHECK-NEXT: st r19, [p0, #28] ; CHECK-NEXT: vextract.32 r18, x10, #10, vaddsign1 ; CHECK-NEXT: vextract.32 r19, x10, #11, vaddsign1 -; CHECK-NEXT: st r6, [p7, #24] -; CHECK-NEXT: st r7, [p7, #28] -; CHECK-NEXT: st r17, [p7, dj5] +; CHECK-NEXT: st r6, [p6, #24] +; CHECK-NEXT: st 
r7, [p6, #28] +; CHECK-NEXT: st r21, [p0, dj0] +; CHECK-NEXT: mova dj0, #56 +; CHECK-NEXT: st r17, [p6, dj5] ; CHECK-NEXT: mova dj5, #8 -; CHECK-NEXT: vextract.64 r5:r4, x0, #5, vaddsign1 +; CHECK-NEXT: vextract.64 r3:r2, x0, #6, vaddsign1 ; CHECK-NEXT: vmov x0, bmll1 -; CHECK-NEXT: st r20, [p0, #24] -; CHECK-NEXT: st r21, [p0, #28] +; CHECK-NEXT: st r20, [p0, dj4] ; CHECK-NEXT: vextract.32 r20, x10, #12, vaddsign1 ; CHECK-NEXT: vextract.32 r21, x10, #13, vaddsign1 -; CHECK-NEXT: st r16, [p7, dj4] -; CHECK-NEXT: st r23, [p0, dj0] -; CHECK-NEXT: mova dj0, #56 +; CHECK-NEXT: st r16, [p6, dj4] ; CHECK-NEXT: lda.s8 r16, [p2, dj5] ; CHECK-NEXT: mova dj5, #9 -; CHECK-NEXT: vextract.64 r3:r2, x0, #6, vaddsign1 -; CHECK-NEXT: vmov x0, bmll1 -; CHECK-NEXT: st r22, [p0, dj4] +; CHECK-NEXT: vextract.64 r1:r0, x0, #7, vaddsign1 +; CHECK-NEXT: st r22, [p0, dj6] +; CHECK-NEXT: st r23, [p0, dj7] ; CHECK-NEXT: vextract.32 r22, x10, #14, vaddsign1 ; CHECK-NEXT: vextract.32 r23, x10, #15, vaddsign1 -; CHECK-NEXT: st r18, [p7, dj6] -; CHECK-NEXT: st r19, [p7, dj7] +; CHECK-NEXT: st r18, [p6, dj6] +; CHECK-NEXT: st r19, [p6, dj7] ; CHECK-NEXT: lda.s8 r17, [p2, dj5] ; CHECK-NEXT: mova dj5, #10 -; CHECK-NEXT: vextract.64 r1:r0, x0, #7, vaddsign1 -; CHECK-NEXT: st r4, [p0, dj6] -; CHECK-NEXT: st r5, [p0, dj7] -; CHECK-NEXT: vextract.32 r4, x10, #4, vaddsign1 -; CHECK-NEXT: vextract.32 r5, x10, #5, vaddsign1 -; CHECK-NEXT: st r20, [p7, dj1] -; CHECK-NEXT: st r21, [p7, dj3] -; CHECK-NEXT: lda.s8 r18, [p2, dj5] -; CHECK-NEXT: mova dj5, #11 ; CHECK-NEXT: st r2, [p0, dj1] ; CHECK-NEXT: st r3, [p0, dj3] ; CHECK-NEXT: vextract.32 r2, x10, #2, vaddsign1 ; CHECK-NEXT: vextract.32 r3, x10, #3, vaddsign1 -; CHECK-NEXT: st r22, [p7, dj0] -; CHECK-NEXT: st r23, [p7, dj2] +; CHECK-NEXT: st r20, [p6, dj1] +; CHECK-NEXT: st r21, [p6, dj3] ; CHECK-NEXT: st r0, [p0, dj0] ; CHECK-NEXT: st r1, [p0, dj2] -; CHECK-NEXT: mov p0, p7 -; CHECK-NEXT: lda.s8 r19, [p2, dj5] -; CHECK-NEXT: mova dj5, #12 +; 
CHECK-NEXT: mov p0, r28 +; CHECK-NEXT: lda.s8 r18, [p2, dj5] +; CHECK-NEXT: mova dj5, #11 ; CHECK-NEXT: vextract.32 r0, x10, #0, vaddsign1 ; CHECK-NEXT: vextract.32 r1, x10, #1, vaddsign1 -; CHECK-NEXT: st r4, [p7, #16] -; CHECK-NEXT: st r5, [p7, #20] -; CHECK-NEXT: st r2, [p7, #8] -; CHECK-NEXT: st r3, [p7, #12] -; CHECK-NEXT: mov p7, r31 -; CHECK-NEXT: lda.s8 r20, [p2, dj5] -; CHECK-NEXT: mova dj5, #13 -; CHECK-NEXT: lda.s8 r21, [p2, dj5] -; CHECK-NEXT: mova dj5, #14 -; CHECK-NEXT: st r0, [p0], #4 -; CHECK-NEXT: mov r25, p0 -; CHECK-NEXT: st r1, [p0, #0] -; CHECK-NEXT: mov p0, r28 -; CHECK-NEXT: lda.s16 r1, [p7, #0] -; CHECK-NEXT: mov p7, r9 -; CHECK-NEXT: lda.s8 r22, [p2, dj5] -; CHECK-NEXT: mova dj5, #15 -; CHECK-NEXT: lda r9, [sp, #-608] // 4-byte Folded Reload -; CHECK-NEXT: lda.s16 r0, [p0, #0] +; CHECK-NEXT: st r22, [p6, dj0] +; CHECK-NEXT: st r23, [p6, dj2] +; CHECK-NEXT: lda.s8 r19, [p2, dj5] +; CHECK-NEXT: mova dj5, #12 +; CHECK-NEXT: st r2, [p6, #8] +; CHECK-NEXT: st r3, [p6, #12] ; CHECK-NEXT: lda.s16 r2, [p0, #4] ; CHECK-NEXT: lda.s16 r3, [p0, #6] ; CHECK-NEXT: lda.s16 r4, [p0, #8] ; CHECK-NEXT: lda.s16 r5, [p0, #10] ; CHECK-NEXT: lda.s16 r6, [p0, #12] ; CHECK-NEXT: lda.s16 r7, [p0, #14] +; CHECK-NEXT: st r0, [p6, #0] +; CHECK-NEXT: st r1, [p6, #4] +; CHECK-NEXT: lda.s16 r0, [p0, #0] +; CHECK-NEXT: lda.s16 r1, [p0, #2] ; CHECK-NEXT: mov p0, r27 -; CHECK-NEXT: lda.s8 r23, [p2, dj5] +; CHECK-NEXT: lda.s8 r20, [p2, dj5] +; CHECK-NEXT: mova dj5, #13 +; CHECK-NEXT: lda.s8 r21, [p2, dj5] ; CHECK-NEXT: st.s16 r0, [p0], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: nop -; CHECK-NEXT: lda r0, [p7, #0] +; CHECK-NEXT: mova dj5, #14 +; CHECK-NEXT: lda.s8 r22, [p2, dj5] ; CHECK-NEXT: st.s16 r1, [p0], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: nop +; CHECK-NEXT: mova dj5, #15 +; CHECK-NEXT: lda.s8 r23, [p2, dj5] ; CHECK-NEXT: st.s16 r2, 
[p0], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -506,7 +482,6 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: lda r2, [p7, #8] ; CHECK-NEXT: st.s16 r3, [p0], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -514,8 +489,6 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: lda r3, [p7, #12] -; CHECK-NEXT: lda p7, [sp, #-640] // 4-byte Folded Reload ; CHECK-NEXT: st.s16 r4, [p0], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -532,173 +505,170 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: lda.s8 r5, [p2, #5] -; CHECK-NEXT: st.s16 r6, [p0], #2 +; CHECK-NEXT: st.s16 r6, [p0, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: lda.s8 r6, [p2, #6] -; CHECK-NEXT: st.s16 r7, [p0, #0] +; CHECK-NEXT: st.s16 r7, [p0, #2] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: nop -; CHECK-NEXT: nop -; CHECK-NEXT: mov p0, r30 +; CHECK-NEXT: mov p4, sp +; CHECK-NEXT: padda [p4], #-256 +; CHECK-NEXT: mov p0, r29 +; CHECK-NEXT: lda r0, [p0, #0] +; CHECK-NEXT: lda.s8 r6, [p2, #6] +; CHECK-NEXT: lda r1, [p0, #4] ; CHECK-NEXT: lda.s8 r7, [p2, #7] -; CHECK-NEXT: lda r1, [p0, #0] +; CHECK-NEXT: lda r2, [p0, #8] +; CHECK-NEXT: lda r3, [p0, #12] ; CHECK-NEXT: mov p0, r26 ; CHECK-NEXT: st r0, [p0], #4 ; CHECK-NEXT: lda.s8 r0, [p2, #0] -; CHECK-NEXT: st.s8 r0, [p3], #1 +; CHECK-NEXT: st r1, [p0], #4 +; CHECK-NEXT: lda.s8 r1, [p2, #1] +; CHECK-NEXT: st.s8 r0, [p4], #1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: lda.s16 r0, [p4, #0] -; CHECK-NEXT: st r1, [p0], #4 -; CHECK-NEXT: st r2, [p0], #4 +; CHECK-NEXT: lda.s16 r0, [p3, #0] +; CHECK-NEXT: st r2, 
[p0, #0] +; CHECK-NEXT: st r3, [p0, #4] ; CHECK-NEXT: lda.s8 r2, [p2, #2] -; CHECK-NEXT: st r3, [p0, #0] ; CHECK-NEXT: lda.s8 r3, [p2, #3] -; CHECK-NEXT: mov p0, r12 -; CHECK-NEXT: lda r12, [sp, #-620] // 4-byte Folded Reload -; CHECK-NEXT: lda r11, [sp, #-616] // 4-byte Folded Reload -; CHECK-NEXT: lda.s8 r1, [p0, #0] -; CHECK-NEXT: lda r13, [sp, #-624] // 4-byte Folded Reload -; CHECK-NEXT: st.s8 r1, [p3], #1 +; CHECK-NEXT: st.s8 r1, [p4], #1 +; CHECK-NEXT: nop +; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: mov p2, r11 ; CHECK-NEXT: nop -; CHECK-NEXT: mov p0, r13 ; CHECK-NEXT: nop -; CHECK-NEXT: lda.s16 r1, [p0, #0] -; CHECK-NEXT: lda r10, [sp, #-612] // 4-byte Folded Reload -; CHECK-NEXT: st.s8 r2, [p3], #1 +; CHECK-NEXT: lda.s16 r1, [p3, #2] +; CHECK-NEXT: st.s8 r2, [p4], #1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: mov p0, r10 ; CHECK-NEXT: nop -; CHECK-NEXT: lda.s16 r2, [p4, #4] -; CHECK-NEXT: st.s8 r3, [p3], #1 ; CHECK-NEXT: nop +; CHECK-NEXT: lda.s16 r2, [p3, #4] +; CHECK-NEXT: st.s8 r3, [p4], #1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: lda.s16 r3, [p4, #6] -; CHECK-NEXT: st.s8 r4, [p3], #1 ; CHECK-NEXT: nop +; CHECK-NEXT: lda.s16 r3, [p3, #6] +; CHECK-NEXT: st.s8 r4, [p4], #1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: lda.s16 r4, [p4, #8] -; CHECK-NEXT: st.s8 r5, [p3], #1 ; CHECK-NEXT: nop +; CHECK-NEXT: lda.s16 r4, [p3, #8] +; CHECK-NEXT: st.s8 r5, [p4], #1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: lda.s16 r5, [p4, #10] -; CHECK-NEXT: st.s8 r6, [p3], #1 ; CHECK-NEXT: nop +; CHECK-NEXT: lda.s16 r5, [p3, #10] +; CHECK-NEXT: st.s8 r6, [p4], #1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: lda.s16 r6, [p4, #12] -; CHECK-NEXT: st.s8 r7, [p3], #1 ; 
CHECK-NEXT: nop +; CHECK-NEXT: lda.s16 r6, [p3, #12] +; CHECK-NEXT: st.s8 r7, [p4], #1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: lda.s16 r7, [p4, #14] -; CHECK-NEXT: st.s8 r16, [p3], #1 +; CHECK-NEXT: nop +; CHECK-NEXT: lda.s16 r7, [p3, #14] +; CHECK-NEXT: st.s8 r16, [p4], #1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: mova dj5, #16 -; CHECK-NEXT: lda.s16 r16, [p4, dj5] -; CHECK-NEXT: st.s8 r17, [p3], #1 +; CHECK-NEXT: lda.s16 r16, [p3, dj5] +; CHECK-NEXT: st.s8 r17, [p4], #1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: mova dj5, #18 -; CHECK-NEXT: lda.s16 r17, [p4, dj5] -; CHECK-NEXT: st.s8 r18, [p3], #1 +; CHECK-NEXT: lda.s16 r17, [p3, dj5] +; CHECK-NEXT: st.s8 r18, [p4], #1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: mova dj5, #20 -; CHECK-NEXT: lda.s16 r18, [p4, dj5] -; CHECK-NEXT: st.s8 r19, [p3], #1 +; CHECK-NEXT: lda.s16 r18, [p3, dj5] +; CHECK-NEXT: st.s8 r19, [p4], #1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: mova dj5, #22 -; CHECK-NEXT: lda.s16 r19, [p4, dj5] -; CHECK-NEXT: st.s8 r20, [p3], #1 +; CHECK-NEXT: lda.s16 r19, [p3, dj5] +; CHECK-NEXT: st.s8 r20, [p4], #1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: mova dj5, #24 -; CHECK-NEXT: lda.s16 r20, [p4, dj5] -; CHECK-NEXT: st.s8 r21, [p3], #1 +; CHECK-NEXT: lda.s16 r20, [p3, dj5] +; CHECK-NEXT: st.s8 r21, [p4], #1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: mova dj5, #26 -; CHECK-NEXT: lda.s16 r21, [p4, dj5] -; CHECK-NEXT: st.s8 r22, [p3], #1 +; CHECK-NEXT: lda.s16 r21, [p3, dj5] +; CHECK-NEXT: st.s8 r22, [p4, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; 
CHECK-NEXT: nop ; CHECK-NEXT: mova dj5, #28 -; CHECK-NEXT: lda.s16 r22, [p4, dj5] -; CHECK-NEXT: st.s8 r23, [p3, #0] +; CHECK-NEXT: lda.s16 r22, [p3, dj5] +; CHECK-NEXT: st.s8 r23, [p4, #1] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: mova dj5, #30 -; CHECK-NEXT: lda.s16 r23, [p4, dj5] -; CHECK-NEXT: lda r15, [sp, #-632] // 4-byte Folded Reload +; CHECK-NEXT: lda.s16 r23, [p3, dj5] ; CHECK-NEXT: st.s16 r0, [p5], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: mov p3, r15 ; CHECK-NEXT: nop +; CHECK-NEXT: mov p0, r30 ; CHECK-NEXT: lda r0, [p0, #0] ; CHECK-NEXT: st.s16 r1, [p5], #2 ; CHECK-NEXT: nop @@ -707,14 +677,13 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: lda r1, [p2, #0] -; CHECK-NEXT: lda r14, [sp, #-628] // 4-byte Folded Reload +; CHECK-NEXT: lda r1, [p0, #4] ; CHECK-NEXT: st.s16 r2, [p5], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: mov p2, r14 +; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: lda r2, [p0, #8] ; CHECK-NEXT: st.s16 r3, [p5], #2 @@ -757,8 +726,9 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: lda r7, [p0, #28] -; CHECK-NEXT: mov p0, r29 +; CHECK-NEXT: mov p0, r25 ; CHECK-NEXT: st r0, [p0], #4 +; CHECK-NEXT: lda r0, [p1, #0] ; CHECK-NEXT: st.s16 r16, [p5], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -766,9 +736,8 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: lda r16, [p1, dj4] ; CHECK-NEXT: st r1, [p0], #4 -; CHECK-NEXT: lda r1, [p3, #4] +; CHECK-NEXT: lda r1, [p1, #4] ; CHECK-NEXT: st.s16 r17, [p5], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -777,7 +746,7 @@ define dso_local void 
@test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: st r2, [p0], #4 -; CHECK-NEXT: lda r2, [p2, #0] +; CHECK-NEXT: lda r2, [p1, #8] ; CHECK-NEXT: st.s16 r18, [p5], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -785,7 +754,6 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: lda r18, [p1, dj6] ; CHECK-NEXT: st r3, [p0], #4 ; CHECK-NEXT: st.s16 r19, [p5], #2 ; CHECK-NEXT: nop @@ -795,7 +763,7 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: st r4, [p0], #4 -; CHECK-NEXT: lda r4, [p3, #16] +; CHECK-NEXT: lda r4, [p1, #16] ; CHECK-NEXT: st.s16 r20, [p5], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -811,98 +779,99 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st r6, [p0], #4 -; CHECK-NEXT: st.s16 r22, [p5], #2 +; CHECK-NEXT: st r6, [p0, #0] +; CHECK-NEXT: st r7, [p0, #4] +; CHECK-NEXT: lda r6, [p1, #24] +; CHECK-NEXT: lda r9, [sp, #-568] // 4-byte Folded Reload +; CHECK-NEXT: st.s16 r22, [p5, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: mov p1, r9 +; CHECK-NEXT: mov p0, r31 +; CHECK-NEXT: lda r3, [p0], #8 +; CHECK-NEXT: lda r16, [p1, dj4] +; CHECK-NEXT: lda r18, [p1, dj6] +; CHECK-NEXT: lda r20, [p1, dj1] +; CHECK-NEXT: lda r22, [p1, dj0] +; CHECK-NEXT: st.s16 r23, [p5, #2] ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st r7, [p0, #0] -; CHECK-NEXT: st.s16 r23, [p5, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: mov p2, r24 -; CHECK-NEXT: mov p0, p3 -; CHECK-NEXT: lda r0, [p0], #12 -; CHECK-NEXT: lda p0, [sp, #-44] // 4-byte Folded Reload -; CHECK-NEXT: lda r20, [p1, dj1] -; CHECK-NEXT: lda r6, [p3, #24] -; CHECK-NEXT: lda r22, [p1, dj0] 
-; CHECK-NEXT: lda r3, [p0], #8 -; CHECK-NEXT: lda r5, [p0], #8 -; CHECK-NEXT: lda r7, [p0, #0] +; CHECK-NEXT: lda r5, [p0, #0] +; CHECK-NEXT: lda r7, [p0, #8] +; CHECK-NEXT: mov p0, r24 ; CHECK-NEXT: st r0, [p0], #4 ; CHECK-NEXT: st r1, [p0], #4 ; CHECK-NEXT: lda r1, [p1, #4] ; CHECK-NEXT: st r2, [p0], #4 -; CHECK-NEXT: lda r2, [p2, #0] +; CHECK-NEXT: lda r2, [p1, #8] ; CHECK-NEXT: st r3, [p0], #4 ; CHECK-NEXT: st r4, [p0], #4 ; CHECK-NEXT: lda r4, [p1, #16] ; CHECK-NEXT: st r5, [p0], #4 -; CHECK-NEXT: st r6, [p0], #4 -; CHECK-NEXT: lda r6, [p1, #24] -; CHECK-NEXT: st r7, [p0, #0] +; CHECK-NEXT: st r6, [p0, #0] +; CHECK-NEXT: st r7, [p0, #4] ; CHECK-NEXT: mov p0, p1 -; CHECK-NEXT: mov p1, r25 +; CHECK-NEXT: lda r6, [p1, #24] ; CHECK-NEXT: lda r0, [p0], #12 ; CHECK-NEXT: lda r3, [p0], #8 ; CHECK-NEXT: lda r5, [p0], #8 ; CHECK-NEXT: lda r7, [p0], #8 ; CHECK-NEXT: lda r17, [p0], #8 ; CHECK-NEXT: lda r19, [p0], #8 -; CHECK-NEXT: lda r21, [p0], #8 -; CHECK-NEXT: lda r23, [p0, #0] +; CHECK-NEXT: lda r21, [p0, #0] +; CHECK-NEXT: lda r23, [p0, #8] ; CHECK-NEXT: mov p0, r8 -; CHECK-NEXT: st r0, [p6], #4 -; CHECK-NEXT: lda r8, [sp, #-604] // 4-byte Folded Reload -; CHECK-NEXT: st r1, [p6], #4 -; CHECK-NEXT: lda r0, [p0, #0] -; CHECK-NEXT: lda r1, [p1, #0] -; CHECK-NEXT: st r2, [p6], #4 -; CHECK-NEXT: lda r2, [p0, #8] -; CHECK-NEXT: st r3, [p6], #4 -; CHECK-NEXT: lda r3, [p0, #12] -; CHECK-NEXT: st r4, [p6], #4 -; CHECK-NEXT: lda r4, [p0, #16] -; CHECK-NEXT: st r5, [p6], #4 -; CHECK-NEXT: lda r5, [p0, #20] -; CHECK-NEXT: st r6, [p6], #4 -; CHECK-NEXT: lda r6, [p0, #24] -; CHECK-NEXT: st r7, [p6], #4 -; CHECK-NEXT: lda r7, [p0, #28] -; CHECK-NEXT: st r16, [p6], #4 -; CHECK-NEXT: lda r16, [p0, dj4] -; CHECK-NEXT: mova dj4, #36 -; CHECK-NEXT: st r17, [p6], #4 -; CHECK-NEXT: lda r17, [p0, dj4] -; CHECK-NEXT: st r18, [p6], #4 -; CHECK-NEXT: lda r18, [p0, dj6] -; CHECK-NEXT: st r19, [p6], #4 -; CHECK-NEXT: lda r19, [p0, dj7] -; CHECK-NEXT: st r20, [p6], #4 -; CHECK-NEXT: lda r20, 
[p0, dj1] -; CHECK-NEXT: st r21, [p6], #4 -; CHECK-NEXT: lda r21, [p0, dj3] -; CHECK-NEXT: lda p0, [sp, #-48] // 4-byte Folded Reload -; CHECK-NEXT: st r22, [p6], #4 -; CHECK-NEXT: lda r22, [p0, dj0] -; CHECK-NEXT: st r23, [p6, #0] -; CHECK-NEXT: lda r23, [p0, dj2] -; CHECK-NEXT: lda p6, [sp, #-636] // 4-byte Folded Reload -; CHECK-NEXT: paddxm [sp], #-640 +; CHECK-NEXT: lda r8, [sp, #-564] // 4-byte Folded Reload +; CHECK-NEXT: st r0, [p7], #4 +; CHECK-NEXT: lda r0, [p6, #0] +; CHECK-NEXT: st r1, [p7], #4 +; CHECK-NEXT: lda r1, [p6, #4] +; CHECK-NEXT: st r2, [p7], #4 +; CHECK-NEXT: lda r2, [p6, #8] +; CHECK-NEXT: st r3, [p7], #4 +; CHECK-NEXT: lda r3, [p6, #12] +; CHECK-NEXT: st r4, [p7], #4 +; CHECK-NEXT: lda r4, [p6, #16] +; CHECK-NEXT: st r5, [p7], #4 +; CHECK-NEXT: lda r5, [p6, #20] +; CHECK-NEXT: st r6, [p7], #4 +; CHECK-NEXT: lda r6, [p6, #24] +; CHECK-NEXT: st r7, [p7], #4 ; CHECK-NEXT: st r0, [p0], #4 +; CHECK-NEXT: lda r7, [p6, #28] +; CHECK-NEXT: st r16, [p7], #4 +; CHECK-NEXT: lda r16, [p6, dj4] +; CHECK-NEXT: mova dj4, #36 ; CHECK-NEXT: st r1, [p0], #4 +; CHECK-NEXT: st r17, [p7], #4 ; CHECK-NEXT: st r2, [p0], #4 +; CHECK-NEXT: lda r17, [p6, dj4] +; CHECK-NEXT: st r18, [p7], #4 ; CHECK-NEXT: st r3, [p0], #4 +; CHECK-NEXT: lda r18, [p6, dj6] +; CHECK-NEXT: st r19, [p7], #4 ; CHECK-NEXT: st r4, [p0], #4 +; CHECK-NEXT: lda r19, [p6, dj7] +; CHECK-NEXT: st r20, [p7], #4 ; CHECK-NEXT: st r5, [p0], #4 +; CHECK-NEXT: lda r20, [p6, dj1] +; CHECK-NEXT: st r21, [p7], #4 ; CHECK-NEXT: st r6, [p0], #4 +; CHECK-NEXT: lda r21, [p6, dj3] +; CHECK-NEXT: st r22, [p7, #0] +; CHECK-NEXT: st r23, [p7, #4] ; CHECK-NEXT: st r7, [p0], #4 +; CHECK-NEXT: lda r22, [p6, dj0] +; CHECK-NEXT: lda r23, [p6, dj2] +; CHECK-NEXT: lda p7, [sp, #-576] // 4-byte Folded Reload +; CHECK-NEXT: lda p6, [sp, #-572] // 4-byte Folded Reload +; CHECK-NEXT: paddxm [sp], #-576 ; CHECK-NEXT: st r16, [p0], #4 ; CHECK-NEXT: st r17, [p0], #4 ; CHECK-NEXT: st r18, [p0], #4 @@ -910,8 +879,8 @@ define 
dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: ret lr ; CHECK-NEXT: st r20, [p0], #4 // Delay Slot 5 ; CHECK-NEXT: st r21, [p0], #4 // Delay Slot 4 -; CHECK-NEXT: st r22, [p0], #4 // Delay Slot 3 -; CHECK-NEXT: st r23, [p0, #0] // Delay Slot 2 +; CHECK-NEXT: st r22, [p0, #0] // Delay Slot 3 +; CHECK-NEXT: st r23, [p0, #4] // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: %a.addr = alloca <8 x i16>, align 8 diff --git a/llvm/test/CodeGen/AIE/dyn-stackalloc.ll b/llvm/test/CodeGen/AIE/dyn-stackalloc.ll index 783f65587ec4..b74e714a35b9 100644 --- a/llvm/test/CodeGen/AIE/dyn-stackalloc.ll +++ b/llvm/test/CodeGen/AIE/dyn-stackalloc.ll @@ -246,36 +246,37 @@ define void @test_huge_stack(i32 noundef %n) #0 { ; AIE2-NEXT: padda [p7], m0 ; AIE2-NEXT: movxm m0, #-40048 ; AIE2-NEXT: mov p0, p7 +; AIE2-NEXT: mov p3, p7 ; AIE2-NEXT: mov p2, p7 ; AIE2-NEXT: mov p6, p7 +; AIE2-NEXT: paddb [p3], #-32 ; AIE2-NEXT: paddb [p0], m0 -; AIE2-NEXT: paddb [p6], #-32 -; AIE2-NEXT: movxm m0, #-40032 ; AIE2-NEXT: st r0, [p0, #0] ; AIE2-NEXT: lda r0, [p0, #0] -; AIE2-NEXT: paddb [p2], m0 +; AIE2-NEXT: paddb [p2], #-24 +; AIE2-NEXT: mov r16, p3 +; AIE2-NEXT: st r1, [p2, #4] ; AIE2-NEXT: mov p0, sp -; AIE2-NEXT: mov r16, p2 -; AIE2-NEXT: st p0, [p6, #0] -; AIE2-NEXT: mov p0, p7 -; AIE2-NEXT: paddb [p0], #-24 +; AIE2-NEXT: st p0, [p3, #0] +; AIE2-NEXT: mov p0, p1 ; AIE2-NEXT: lshl r2, r0, r2 -; AIE2-NEXT: st r0, [p0], #4 +; AIE2-NEXT: st r0, [p2, #0] ; AIE2-NEXT: add r2, r2, #31 -; AIE2-NEXT: st r1, [p0, #0] +; AIE2-NEXT: and r2, r2, r3 ; AIE2-NEXT: jl #extern_call -; AIE2-NEXT: mov p0, p1 // Delay Slot 5 -; AIE2-NEXT: and r2, r2, r3 // Delay Slot 4 -; AIE2-NEXT: mov m0, r2 // Delay Slot 3 -; AIE2-NEXT: paddb [p1], m0 // Delay Slot 2 +; AIE2-NEXT: mov m0, r2 // Delay Slot 5 +; AIE2-NEXT: paddb [p1], m0 // Delay Slot 4 +; AIE2-NEXT: movxm m0, #-40032 // Delay Slot 3 +; AIE2-NEXT: paddb [p6], m0 // Delay Slot 2 ; AIE2-NEXT: mov sp, p1 // Delay Slot 1 ; 
AIE2-NEXT: nopb ; nopa ; nops ; jl #extern_call; nopv ; AIE2-NEXT: nopa ; nopx // Delay Slot 5 ; AIE2-NEXT: nop // Delay Slot 4 ; AIE2-NEXT: nop // Delay Slot 3 ; AIE2-NEXT: nop // Delay Slot 2 -; AIE2-NEXT: mov p0, r16 // Delay Slot 1 -; AIE2-NEXT: lda p0, [p6, #0]; nopx +; AIE2-NEXT: mov p0, p6 // Delay Slot 1 +; AIE2-NEXT: nopb ; nopa ; nops ; nopx ; mov p0, r16; nopv +; AIE2-NEXT: lda p0, [p0, #0]; nopx ; AIE2-NEXT: nop ; AIE2-NEXT: nop ; AIE2-NEXT: nop @@ -329,18 +330,18 @@ define void @test_huge_stack(i32 noundef %n) #0 { ; AIE2P-NEXT: padda [p0], m0 ; AIE2P-NEXT: mova m0, #-32 ; AIE2P-NEXT: padda [p3], m0 +; AIE2P-NEXT: mova m0, #-24 ; AIE2P-NEXT: st r0, [p0, #0] ; AIE2P-NEXT: lda r0, [p0, #0] -; AIE2P-NEXT: mova m0, #-24 ; AIE2P-NEXT: mov p0, sp ; AIE2P-NEXT: mov r8, p3 ; AIE2P-NEXT: padda [p2], m0 ; AIE2P-NEXT: st p0, [p3, #0] ; AIE2P-NEXT: mov p0, p1 +; AIE2P-NEXT: st r1, [p2, #4] ; AIE2P-NEXT: lshl r2, r0, r2 -; AIE2P-NEXT: st r0, [p2], #4 +; AIE2P-NEXT: st r0, [p2, #0] ; AIE2P-NEXT: add r2, r2, #63 -; AIE2P-NEXT: st r1, [p2, #0] ; AIE2P-NEXT: and r2, r2, r3 ; AIE2P-NEXT: jl #extern_call ; AIE2P-NEXT: mov m0, r2 // Delay Slot 5 From e8a8c6203a061161d3455de667346a518018f980 Mon Sep 17 00:00:00 2001 From: Fabian Stuckmann Date: Tue, 10 Jun 2025 03:34:34 -0600 Subject: [PATCH 6/6] [AIEX] Reorder PostInc and PreInc Memory Instr --- llvm/lib/Target/AIE/AIEGlobalCombiner.cpp | 44 ++- llvm/lib/Target/AIE/AIEGlobalCombiner.h | 15 +- .../Target/AIE/AIEGlobalCombinerPtrMods.cpp | 74 ++++- .../lib/Target/AIE/AIEGlobalCombinerPtrMods.h | 18 ++ llvm/lib/Target/AIE/AIEPtrModOptimizer.cpp | 3 + .../GlobalIsel/global-combiners/gemm.mir | 2 +- .../global-combiners/post-inc-eagerness.mir | 4 +- .../global-combiners/reorder-Mem-Instrs.mir | 2 +- .../global-combiners/user-intrinsics.mir | 4 +- .../CodeGen/AIE/aie2p/load-store-unaligned.ll | 276 +++++++++--------- 10 files changed, 290 insertions(+), 152 deletions(-) diff --git a/llvm/lib/Target/AIE/AIEGlobalCombiner.cpp 
b/llvm/lib/Target/AIE/AIEGlobalCombiner.cpp index 5d8cec05d34f..3d47e86b9364 100644 --- a/llvm/lib/Target/AIE/AIEGlobalCombiner.cpp +++ b/llvm/lib/Target/AIE/AIEGlobalCombiner.cpp @@ -69,9 +69,12 @@ AIEGlobalCombiner::findBeneficialCombiners() { CombineCandidates.filterOut(FixedCombiners); CombineCandidates.filterOut(FoundCombiners); - for (auto *Combiner : - CombineCandidates.searchCombinerSet(OwnedCombineCandidates)) - FoundCombiners.push_back(Combiner); + auto Combiners = + CombineCandidates.searchCombinerSet(OwnedCombineCandidates); + reorderCombinerInsertions(Combiners); + + FoundCombiners.insert(FoundCombiners.end(), Combiners.begin(), + Combiners.end()); } LLVM_DEBUG(dbgs() << "[Global Combiner] Found " << FoundCombiners.size() @@ -80,6 +83,27 @@ AIEGlobalCombiner::findBeneficialCombiners() { return FoundCombiners; } +void AIEGlobalCombiner::reorderCombinerInsertions( + std::vector &Combiners) const { + for (auto *Combiner : Combiners) { + if (!Combiner->canReorder()) + continue; + + auto It = std::find_if(Combiners.rbegin(), Combiners.rend(), + [Combiner](GenericCombiner *Candidate) { + return Combiner->isReorderCandidate(Candidate); + }); + + if (It == Combiners.rend()) + continue; + + auto *BestCandidate = *It; + Combiner->copyInsertionPoint(BestCandidate); + LLVM_DEBUG(dbgs() << "Reordering \n"; Combiner->dumpFull(); + BestCandidate->dumpFull();); + } +} + void AIEGlobalCombiner::calculateCombineCandidates( SUnit &CombineRoot, const GenericCombiner *Combiner) { assert(MDT); @@ -198,7 +222,7 @@ std::vector AIEGlobalCombiner::getCombineCandidates( // -------------------------- CombineCandidates ------------------------------// -std::vector CombineCandidates::searchCombinerSet( +std::vector CombineCandidates::searchCombinerSet( const std::vector> &OwnedCombineCandidates) { if (Combiners.empty()) @@ -299,7 +323,7 @@ std::vector CombineCandidates::searchCombinerSet( LLVM_DEBUG(dbgs() << "Search Result " << BestSolution.getGain() << "\n"); // Save best 
Candidate to FixedCombiners - std::vector Result; + std::vector Result; BitVector CombinerBitVec = BestSolution.getCombinersBitVector(); for (int Idx = CombinerBitVec.find_first(); Idx != -1; Idx = CombinerBitVec.find_next(Idx)) { @@ -517,6 +541,16 @@ void GenericCombiner::setGlobalID(unsigned GlobalID) { this->GlobalID = GlobalID; } +bool GenericCombiner::isReorderCandidate( + const GenericCombiner *Candidate) const { + return false; +} + +void GenericCombiner::copyInsertionPoint(const GenericCombiner *Candidate) { + CombinerData.InsertionPoint = Candidate->CombinerData.InsertionPoint; + InsertionPointNodeNum = Candidate->InsertionPointNodeNum; +} + /// \return whether a Combiner is used after a Remove-Combiner, that /// are part of the same Cluster. The Ordering of the Combiners \p A and \p B is /// irrelevant. diff --git a/llvm/lib/Target/AIE/AIEGlobalCombiner.h b/llvm/lib/Target/AIE/AIEGlobalCombiner.h index 955417ca0e2a..99e1a8532fee 100644 --- a/llvm/lib/Target/AIE/AIEGlobalCombiner.h +++ b/llvm/lib/Target/AIE/AIEGlobalCombiner.h @@ -256,6 +256,16 @@ class GenericCombiner { /// Set unique Identifier for this Combiner to \p GlobalID void setGlobalID(unsigned GlobalID); + + /// \return whether this Combiner could be moved before another Combiner + virtual bool canReorder() const = 0; + + /// \return whether \p Candidate is a ReorderCandidate, i.e. 
if this combiner + /// can be inserted right before \p Candidate + virtual bool isReorderCandidate(const GenericCombiner *Candidate) const; + + /// Set InsertionPoint of this combiner to the same as \p Candidate + void copyInsertionPoint(const GenericCombiner *Candidate); }; raw_ostream &operator<<(raw_ostream &OS, const GenericCombiner &Val); @@ -338,7 +348,7 @@ class CombineCandidates { /// \return Combiners from \p OwnedCombineCandidates that maximize the gain /// when applied - std::vector + std::vector searchCombinerSet(const std::vector> &OwnedCombineCandidates); @@ -396,6 +406,9 @@ class AIEGlobalCombiner { void calculateCombinerConflicts(); + void + reorderCombinerInsertions(std::vector &Combiners) const; + /// \return CombineCandidates sorted by highest potential gain std::vector getCombineCandidates( std::map ClusteredCombiners); diff --git a/llvm/lib/Target/AIE/AIEGlobalCombinerPtrMods.cpp b/llvm/lib/Target/AIE/AIEGlobalCombinerPtrMods.cpp index bd13c1506483..9966799c87a1 100644 --- a/llvm/lib/Target/AIE/AIEGlobalCombinerPtrMods.cpp +++ b/llvm/lib/Target/AIE/AIEGlobalCombinerPtrMods.cpp @@ -315,7 +315,25 @@ std::unique_ptr OffsetCombiner::clone() const { std::optional, std::vector>> OffsetCombiner::getInstructionsToMove(const AIE::DataDependenceHelper &DAG) { - return {{/*MoveUp*/ {}, /*MoveDown*/ {}}}; + auto *PtrAdd = getPtrInc(); + if (!getImm(*PtrAdd, *MRI)) { + /// Offset is not an immediate and the OffsetCombiner is not eligible for + /// reordering. + // Since the Offset already dominates the MemoryInstruction (where the + // insertion happens), no checks have to be performed. + return {{/*MoveUp=*/{}, /*MoveDown=*/{}}}; + } + + auto *SUnitPtrAdd = DAG.getSUnit(PtrAdd); + if (!SUnitPtrAdd) { + /// PtrAdd is an Immediate but it is outside of the MBB, so it already + /// dominates the MemoryInstruction. No checks have to be performed. + return {{/*MoveUp=*/{}, /*MoveDown=*/{}}}; + } + + /// Immediate Offset can be a reordering Candidate. 
Therefore, track Immediate + /// Offset, so it can be moved in case of a reordering. + return {{/*MoveUp=*/{SUnitPtrAdd}, /*MoveDown=*/{}}}; } void OffsetCombiner::adjustGain(const MachineDominatorTree &MDT) { @@ -335,7 +353,7 @@ void OffsetCombiner::adjustGain(const MachineDominatorTree &MDT) { Gain.setPtrMod(0); } - std::optional ImmOffset = getImm(*PtrAdd, *MRI); + ImmOffset = getImm(*PtrAdd, *MRI); if (!ImmOffset) return; @@ -363,6 +381,58 @@ std::optional OffsetCombiner::getOpCode(MachineInstr *PtrInc, return TII->getOffsetMemOpcode(MemI->getOpcode()); } +bool OffsetCombiner::canReorder() const { return ImmOffset.has_value(); } + +bool OffsetCombiner::isReorderCandidate( + const GenericCombiner *PostIncCombiner) const { + auto GetInputPtr = [&](const MachineInstr *PtrMod) { + auto InputPtrIdx = PtrModSupport.getInputPtrIdx(*PtrMod); + assert(InputPtrIdx); + return PtrMod->getOperand(*InputPtrIdx); + }; + + const PointerModifierCombiner *PtrModCombiner = + static_cast(PostIncCombiner); + if (!PtrModCombiner->isPostInc()) + return false; + + // only allow loads to be reordered + if (getMemI()->mayStore() || PtrModCombiner->getMemI()->mayStore()) + return false; + + // Same MBB check + auto *PtrAdd = getPtrInc(); + auto *PostIncPtrMod = PtrModCombiner->getPtrInc(); + if (PtrAdd->getParent() != PostIncPtrMod->getParent()) + return false; + + // Same Input Ptr Check + auto InputPtr = GetInputPtr(PtrAdd); + auto PostIncInputPtr = GetInputPtr(PostIncPtrMod); + if (!InputPtr.isIdenticalTo(PostIncInputPtr)) + return false; + + // Check if Store Instruction of Offset dominates PostInc + auto *MemI = getMemI(); + if (MemI->mayStore()) { + auto Source = MemI->getOperand(0); + assert(Source.isReg()); + auto *DefSource = MRI->getUniqueVRegDef(Source.getReg()); + if (!DefSource) + return false; + auto *DefSUnit = DAG->getSUnit(DefSource); + if (DefSUnit && + DefSUnit->NodeNum > PostIncCombiner->InsertionPointNodeNum) { + // Source of Offset-Store would be after the new 
InsertionPoint and thus + // generate invalid mir + return false; + } + } + + // OffsetCombiner occurs after PostIncCombiner + return InsertionPointNodeNum > PostIncCombiner->InsertionPointNodeNum; +} + // -------------------------- PostIncCombiner --------------------------------// bool PostIncCombiner::isCombineCandidate(MachineInstr &MemI, diff --git a/llvm/lib/Target/AIE/AIEGlobalCombinerPtrMods.h b/llvm/lib/Target/AIE/AIEGlobalCombinerPtrMods.h index f4d0612a32a5..7e8c73f25e0b 100644 --- a/llvm/lib/Target/AIE/AIEGlobalCombinerPtrMods.h +++ b/llvm/lib/Target/AIE/AIEGlobalCombinerPtrMods.h @@ -118,9 +118,13 @@ class PointerModifierCombiner : public GenericCombiner { /// \return whether Opcode can be set bool tryToSetCombinedOpCode() override; + + virtual bool isPostInc() const = 0; }; class OffsetCombiner : public PointerModifierCombiner { + std::optional ImmOffset; + protected: std::optional getOpCode(MachineInstr *PtrInc, MachineInstr *MemI) const override; @@ -141,6 +145,12 @@ class OffsetCombiner : public PointerModifierCombiner { std::optional, std::vector>> getInstructionsToMove(const AIE::DataDependenceHelper &DAG) override; + + bool isReorderCandidate(const GenericCombiner *Candidate) const override; + + bool canReorder() const override; + + bool isPostInc() const override { return false; } }; class PostIncCombiner : public PointerModifierCombiner { @@ -176,6 +186,14 @@ class PostIncCombiner : public PointerModifierCombiner { void adjustGain(const MachineDominatorTree &MDT) override; std::vector getPtrInstrs(MachineInstr *MI) const override; + + bool isReorderCandidate(const GenericCombiner *Candidate) const override { + return false; + } + + bool canReorder() const override { return false; } + + bool isPostInc() const override { return true; } }; } // namespace llvm::AIE diff --git a/llvm/lib/Target/AIE/AIEPtrModOptimizer.cpp b/llvm/lib/Target/AIE/AIEPtrModOptimizer.cpp index 029b6a4688df..607982037a00 100644 --- 
a/llvm/lib/Target/AIE/AIEPtrModOptimizer.cpp +++ b/llvm/lib/Target/AIE/AIEPtrModOptimizer.cpp @@ -167,6 +167,9 @@ void FoundCombiners::remapCombiner(AIE::Combiner &Combiner) const { getRemappedInstrs(Combiner.DelayInstrToInsertionPoint); Combiner.DelayInstrPastInsertionPoint = getRemappedInstrs(Combiner.DelayInstrPastInsertionPoint); + + std::vector InsertionPointVec = {Combiner.InsertionPoint}; + Combiner.InsertionPoint = getRemappedInstrs(InsertionPointVec)[0]; } const std::map & diff --git a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/gemm.mir b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/gemm.mir index cf143612076a..7e1396f4a3f7 100644 --- a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/gemm.mir +++ b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/gemm.mir @@ -40,9 +40,9 @@ body: | ; CHECK-NEXT: [[PHI1:%[0-9]+]]:_(p0) = G_PHI [[COPY]](p0), %bb.0, %9(p0), %bb.1 ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI]], [[C2]] ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(sgt), [[ADD]](s32), [[C]] - ; CHECK-NEXT: [[AIE_POSTINC_3D_LOAD:%[0-9]+]]:_(<32 x s16>), [[AIE_POSTINC_3D_LOAD1:%[0-9]+]]:_(p0), [[AIE_POSTINC_3D_LOAD2:%[0-9]+]]:_(s20), [[AIE_POSTINC_3D_LOAD3:%[0-9]+]]:_ = G_AIE_POSTINC_3D_LOAD [[PHI1]], [[C1]], [[C1]], [[C1]], [[C1]], [[C1]], [[C1]], [[C1]] :: (load (<32 x s16>)) ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 ; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:_(<32 x s16>) = G_AIE_OFFSET_LOAD [[PHI1]](p0), [[C4]](s20) :: (load (<32 x s16>)) + ; CHECK-NEXT: [[AIE_POSTINC_3D_LOAD:%[0-9]+]]:_(<32 x s16>), [[AIE_POSTINC_3D_LOAD1:%[0-9]+]]:_(p0), [[AIE_POSTINC_3D_LOAD2:%[0-9]+]]:_(s20), [[AIE_POSTINC_3D_LOAD3:%[0-9]+]]:_ = G_AIE_POSTINC_3D_LOAD [[PHI1]], [[C1]], [[C1]], [[C1]], [[C1]], [[C1]], [[C1]], [[C1]] :: (load (<32 x s16>)) ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[PHI1]], %configZero(s20) ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s20) = G_CONSTANT i20 0 ; CHECK-NEXT: 
[[AIE_OFFSET_LOAD1:%[0-9]+]]:_(<32 x s16>) = G_AIE_OFFSET_LOAD [[PTR_ADD]](p0), [[C5]](s20) :: (load (<32 x s16>)) diff --git a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/post-inc-eagerness.mir b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/post-inc-eagerness.mir index d4ab16990614..ceaa853f0ac1 100644 --- a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/post-inc-eagerness.mir +++ b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/post-inc-eagerness.mir @@ -17,6 +17,7 @@ # increasing register pressure. # FIXME: reorder instructions +# Do not reorder Store Instructions. --- name: post-inc-reg-pressure-store legalized: true @@ -46,6 +47,7 @@ body: | PseudoRET implicit $lr, implicit %3 ... +# Reorder Load Instructions. # similar example as above, but with load instructions --- name: post-inc-reg-pressure-load @@ -60,9 +62,9 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 - ; CHECK-NEXT: %lZero:_(s32), %7:_(p0), %8:_(s20), %9:_ = G_AIE_POSTINC_3D_LOAD [[COPY]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (load (s32)) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 8 ; CHECK-NEXT: %lOne:_(s32) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C1]](s20) :: (load (s32)) + ; CHECK-NEXT: %lZero:_(s32), %7:_(p0), %8:_(s20), %9:_ = G_AIE_POSTINC_3D_LOAD [[COPY]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (load (s32)) ; CHECK-NEXT: PseudoRET implicit $lr, implicit %7(p0), implicit %lZero(s32), implicit %lOne(s32) %0:_(p0) = COPY $p0 %1:_(s20) = G_CONSTANT i20 64 diff --git a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/reorder-Mem-Instrs.mir b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/reorder-Mem-Instrs.mir index 72097086bdc2..7ce34cc6791e 100644 --- a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/reorder-Mem-Instrs.mir +++ b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/reorder-Mem-Instrs.mir @@ 
-28,8 +28,8 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 - ; CHECK-NEXT: %sZero:_(s32), %6:_(p0), %7:_(s20), %8:_ = G_AIE_POSTINC_3D_LOAD [[COPY]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (load (s32)) ; CHECK-NEXT: %sOne:_(s32) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C]](s20) :: (load (s32)) + ; CHECK-NEXT: %sZero:_(s32), %6:_(p0), %7:_(s20), %8:_ = G_AIE_POSTINC_3D_LOAD [[COPY]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (load (s32)) ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = nsw G_ADD %sZero, %sOne ; CHECK-NEXT: G_STORE [[ADD]](s32), %6(p0) :: (store (s32)) ; CHECK-NEXT: PseudoRET implicit $lr diff --git a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/user-intrinsics.mir b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/user-intrinsics.mir index 8f01580ae760..6508436ecdf9 100644 --- a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/user-intrinsics.mir +++ b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/global-combiners/user-intrinsics.mir @@ -29,10 +29,10 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p1 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 8 - ; CHECK-NEXT: [[AIE_POSTINC_3D_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_3D_LOAD1:%[0-9]+]]:_(p0), [[AIE_POSTINC_3D_LOAD2:%[0-9]+]]:_(s20), [[AIE_POSTINC_3D_LOAD3:%[0-9]+]]:_ = G_AIE_POSTINC_3D_LOAD [[COPY]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (load (s32)) ; CHECK-NEXT: [[AIE_OFFSET_LOAD:%[0-9]+]]:_(s32) = G_AIE_OFFSET_LOAD [[COPY]](p0), [[C]](s20) :: (load (s32)) - ; CHECK-NEXT: [[AIE_POSTINC_3D_LOAD4:%[0-9]+]]:_(s32), [[AIE_POSTINC_3D_LOAD5:%[0-9]+]]:_(p0), [[AIE_POSTINC_3D_LOAD6:%[0-9]+]]:_(s20), [[AIE_POSTINC_3D_LOAD7:%[0-9]+]]:_ = G_AIE_POSTINC_3D_LOAD [[COPY1]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (load (s32)) + ; CHECK-NEXT: [[AIE_POSTINC_3D_LOAD:%[0-9]+]]:_(s32), 
[[AIE_POSTINC_3D_LOAD1:%[0-9]+]]:_(p0), [[AIE_POSTINC_3D_LOAD2:%[0-9]+]]:_(s20), [[AIE_POSTINC_3D_LOAD3:%[0-9]+]]:_ = G_AIE_POSTINC_3D_LOAD [[COPY]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (load (s32)) ; CHECK-NEXT: [[AIE_OFFSET_LOAD1:%[0-9]+]]:_(s32) = G_AIE_OFFSET_LOAD [[COPY1]](p0), [[C]](s20) :: (load (s32)) + ; CHECK-NEXT: [[AIE_POSTINC_3D_LOAD4:%[0-9]+]]:_(s32), [[AIE_POSTINC_3D_LOAD5:%[0-9]+]]:_(p0), [[AIE_POSTINC_3D_LOAD6:%[0-9]+]]:_(s20), [[AIE_POSTINC_3D_LOAD7:%[0-9]+]]:_ = G_AIE_POSTINC_3D_LOAD [[COPY1]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]], [[C]] :: (load (s32)) ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[AIE_POSTINC_3D_LOAD]](s32), implicit [[AIE_OFFSET_LOAD]](s32), implicit [[AIE_POSTINC_3D_LOAD4]](s32), implicit [[AIE_OFFSET_LOAD1]](s32), implicit [[AIE_POSTINC_3D_LOAD1]](p0), implicit [[AIE_POSTINC_3D_LOAD5]](p0) %0:_(p0) = COPY $p0 %1:_(p0) = COPY $p1 diff --git a/llvm/test/CodeGen/AIE/aie2p/load-store-unaligned.ll b/llvm/test/CodeGen/AIE/aie2p/load-store-unaligned.ll index 4bae794dd7af..91b8b88d525a 100644 --- a/llvm/test/CodeGen/AIE/aie2p/load-store-unaligned.ll +++ b/llvm/test/CodeGen/AIE/aie2p/load-store-unaligned.ll @@ -16,16 +16,15 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-LABEL: test_load_store_unaligned: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: mova m0, #-560; nopb ; nopxm ; nops +; CHECK-NEXT: mova m0, #-560; nopb ; nopx ; CHECK-NEXT: paddxm [sp], #576 ; CHECK-NEXT: mov p0, sp ; CHECK-NEXT: mov p2, sp -; CHECK-NEXT: st p6, [sp, #-572] // 4-byte Folded Spill -; CHECK-NEXT: mov p6, sp ; CHECK-NEXT: mov p5, sp ; CHECK-NEXT: st p7, [sp, #-576] // 4-byte Folded Spill ; CHECK-NEXT: mov p7, sp -; CHECK-NEXT: st r8, [sp, #-564] // 4-byte Folded Spill +; CHECK-NEXT: st p6, [sp, #-572] // 4-byte Folded Spill +; CHECK-NEXT: mov p6, sp ; CHECK-NEXT: st r9, [sp, #-568] // 4-byte Folded Spill ; CHECK-NEXT: padda [p0], m0 ; CHECK-NEXT: mova m0, #-544 @@ 
-39,36 +38,36 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: mova m0, #-480 ; CHECK-NEXT: padda [p0], m0 ; CHECK-NEXT: mova m0, #-416 -; CHECK-NEXT: mov r30, p0 -; CHECK-NEXT: mov p0, sp -; CHECK-NEXT: padda [p0], m0 +; CHECK-NEXT: mov r31, p0 +; CHECK-NEXT: padda [p5], m0 ; CHECK-NEXT: mova m0, #-352 -; CHECK-NEXT: mov r9, p0 ; CHECK-NEXT: mov p0, sp -; CHECK-NEXT: padda [p6], m0 +; CHECK-NEXT: padda [p7], m0 ; CHECK-NEXT: mova m0, #-288 ; CHECK-NEXT: padda [p0], m0 ; CHECK-NEXT: mova m0, #-272 -; CHECK-NEXT: mov r27, p0 +; CHECK-NEXT: mov r28, p0 ; CHECK-NEXT: mov p0, sp ; CHECK-NEXT: padda [p0], m0 ; CHECK-NEXT: mova m0, #-240 -; CHECK-NEXT: mov r26, p0 +; CHECK-NEXT: mov r27, p0 ; CHECK-NEXT: mov p0, sp -; CHECK-NEXT: padda [p5], m0 +; CHECK-NEXT: padda [p6], m0 ; CHECK-NEXT: mova m0, #-208 ; CHECK-NEXT: padda [p0], m0 ; CHECK-NEXT: mova m0, #-176 -; CHECK-NEXT: mov r25, p0 +; CHECK-NEXT: mov r26, p0 ; CHECK-NEXT: mov p0, sp ; CHECK-NEXT: padda [p0], m0 ; CHECK-NEXT: mova m0, #-144 -; CHECK-NEXT: mov r24, p0 +; CHECK-NEXT: mov r25, p0 ; CHECK-NEXT: mov p0, sp -; CHECK-NEXT: padda [p7], m0 +; CHECK-NEXT: padda [p0], m0 ; CHECK-NEXT: mova m0, #-80 +; CHECK-NEXT: mov r9, p0 +; CHECK-NEXT: mov p0, sp ; CHECK-NEXT: padda [p0], m0 -; CHECK-NEXT: mov r8, p0 +; CHECK-NEXT: mov r24, p0 ; CHECK-NEXT: mov p0, r17 ; CHECK-NEXT: st.s16 r0, [p0, #0] ; CHECK-NEXT: nop @@ -122,7 +121,7 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: st.s16 r7, [p0, #14] ; CHECK-NEXT: vextract.32 r7, x0, #7, vaddsign1 ; CHECK-NEXT: vmov wl0, q0 -; CHECK-NEXT: mov r28, p0 +; CHECK-NEXT: mov r29, p0 ; CHECK-NEXT: vextract.32 r0, x0, #0, vaddsign1 ; CHECK-NEXT: vextract.32 r1, x0, #1, vaddsign1 ; CHECK-NEXT: mov p0, r16 @@ -186,6 +185,7 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: vextract.16 r7, x4, #7, vaddsign1 ; CHECK-NEXT: nop ; CHECK-NEXT: 
mova dj0, #8 +; CHECK-NEXT: st r8, [sp, #-564] // 4-byte Folded Spill ; CHECK-NEXT: st.s8 r16, [p2, dj0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -287,10 +287,10 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: st.s16 r6, [p3, #12] ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: mov r29, p0 +; CHECK-NEXT: mov r30, p0 ; CHECK-NEXT: vextract.16 r6, x6, #6, vaddsign1 ; CHECK-NEXT: vextract.32 r0, x8, #0, vaddsign1 -; CHECK-NEXT: mov p0, r30 +; CHECK-NEXT: mov p0, r31 ; CHECK-NEXT: st r0, [p0, #0] ; CHECK-NEXT: st.s16 r7, [p3, #14] ; CHECK-NEXT: nop @@ -334,7 +334,7 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: st r4, [p0, #16] ; CHECK-NEXT: st r5, [p0, #20] ; CHECK-NEXT: st.s16 r20, [p3, dj0] -; CHECK-NEXT: mov r30, p0 +; CHECK-NEXT: mov r31, p0 ; CHECK-NEXT: mova dj2, #60 ; CHECK-NEXT: vextract.16 r20, x6, #12, vaddsign1 ; CHECK-NEXT: vmov x0, bmll0 @@ -370,92 +370,90 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: st r0, [p0], #12 ; CHECK-NEXT: vextract.64 r5:r4, x0, #0, vaddsign1 ; CHECK-NEXT: vmov x0, bmll1 -; CHECK-NEXT: mov r31, p0 -; CHECK-NEXT: mov p0, r9 +; CHECK-NEXT: mov r8, p0 +; CHECK-NEXT: mov p0, r29 ; CHECK-NEXT: st r6, [p1, #24] ; CHECK-NEXT: st r7, [p1, #28] ; CHECK-NEXT: vextract.64 r7:r6, x0, #1, vaddsign1 ; CHECK-NEXT: vmov x0, bmll1 -; CHECK-NEXT: mov r9, p0 ; CHECK-NEXT: vextract.64 r17:r16, x0, #2, vaddsign1 ; CHECK-NEXT: vmov x0, bmll1 -; CHECK-NEXT: st r4, [p0, #0] -; CHECK-NEXT: st r5, [p0, #4] +; CHECK-NEXT: st r4, [p5, #0] +; CHECK-NEXT: st r5, [p5, #4] ; CHECK-NEXT: vextract.32 r4, x10, #4, vaddsign1 ; CHECK-NEXT: vextract.32 r5, x10, #5, vaddsign1 ; CHECK-NEXT: vextract.64 r19:r18, x0, #3, vaddsign1 ; CHECK-NEXT: vmov x0, bmll1 -; CHECK-NEXT: st r6, [p0, #8] -; CHECK-NEXT: st r7, [p0, #12] +; CHECK-NEXT: st r6, [p5, #8] +; CHECK-NEXT: st r7, [p5, #12] ; CHECK-NEXT: vextract.32 r6, x10, 
#6, vaddsign1 ; CHECK-NEXT: vextract.32 r7, x10, #7, vaddsign1 ; CHECK-NEXT: vextract.64 r21:r20, x0, #4, vaddsign1 ; CHECK-NEXT: vmov x0, bmll1 -; CHECK-NEXT: st r16, [p0, #16] -; CHECK-NEXT: st r17, [p0, #20] +; CHECK-NEXT: st r16, [p5, #16] +; CHECK-NEXT: st r17, [p5, #20] ; CHECK-NEXT: vextract.32 r16, x10, #8, vaddsign1 ; CHECK-NEXT: vextract.32 r17, x10, #9, vaddsign1 -; CHECK-NEXT: st r4, [p6, #16] -; CHECK-NEXT: st r5, [p6, #20] +; CHECK-NEXT: st r4, [p7, #16] +; CHECK-NEXT: st r5, [p7, #20] +; CHECK-NEXT: lda.s16 r4, [p0, #8] +; CHECK-NEXT: lda.s16 r5, [p0, #10] ; CHECK-NEXT: vextract.64 r23:r22, x0, #5, vaddsign1 ; CHECK-NEXT: vmov x0, bmll1 -; CHECK-NEXT: st r18, [p0, #24] -; CHECK-NEXT: st r19, [p0, #28] +; CHECK-NEXT: st r18, [p5, #24] +; CHECK-NEXT: st r19, [p5, #28] ; CHECK-NEXT: vextract.32 r18, x10, #10, vaddsign1 ; CHECK-NEXT: vextract.32 r19, x10, #11, vaddsign1 -; CHECK-NEXT: st r6, [p6, #24] -; CHECK-NEXT: st r7, [p6, #28] -; CHECK-NEXT: st r21, [p0, dj0] +; CHECK-NEXT: st r6, [p7, #24] +; CHECK-NEXT: st r7, [p7, #28] +; CHECK-NEXT: lda.s16 r6, [p0, #12] +; CHECK-NEXT: lda.s16 r7, [p0, #14] +; CHECK-NEXT: st r21, [p5, dj0] ; CHECK-NEXT: mova dj0, #56 -; CHECK-NEXT: st r17, [p6, dj5] +; CHECK-NEXT: st r17, [p7, dj5] ; CHECK-NEXT: mova dj5, #8 ; CHECK-NEXT: vextract.64 r3:r2, x0, #6, vaddsign1 ; CHECK-NEXT: vmov x0, bmll1 -; CHECK-NEXT: st r20, [p0, dj4] +; CHECK-NEXT: st r20, [p5, dj4] ; CHECK-NEXT: vextract.32 r20, x10, #12, vaddsign1 ; CHECK-NEXT: vextract.32 r21, x10, #13, vaddsign1 -; CHECK-NEXT: st r16, [p6, dj4] +; CHECK-NEXT: st r16, [p7, dj4] ; CHECK-NEXT: lda.s8 r16, [p2, dj5] ; CHECK-NEXT: mova dj5, #9 ; CHECK-NEXT: vextract.64 r1:r0, x0, #7, vaddsign1 -; CHECK-NEXT: st r22, [p0, dj6] -; CHECK-NEXT: st r23, [p0, dj7] +; CHECK-NEXT: st r22, [p5, dj6] +; CHECK-NEXT: st r23, [p5, dj7] ; CHECK-NEXT: vextract.32 r22, x10, #14, vaddsign1 ; CHECK-NEXT: vextract.32 r23, x10, #15, vaddsign1 -; CHECK-NEXT: st r18, [p6, dj6] -; CHECK-NEXT: st 
r19, [p6, dj7] +; CHECK-NEXT: st r18, [p7, dj6] +; CHECK-NEXT: st r19, [p7, dj7] ; CHECK-NEXT: lda.s8 r17, [p2, dj5] ; CHECK-NEXT: mova dj5, #10 -; CHECK-NEXT: st r2, [p0, dj1] -; CHECK-NEXT: st r3, [p0, dj3] +; CHECK-NEXT: st r2, [p5, dj1] +; CHECK-NEXT: st r3, [p5, dj3] ; CHECK-NEXT: vextract.32 r2, x10, #2, vaddsign1 ; CHECK-NEXT: vextract.32 r3, x10, #3, vaddsign1 -; CHECK-NEXT: st r20, [p6, dj1] -; CHECK-NEXT: st r21, [p6, dj3] -; CHECK-NEXT: st r0, [p0, dj0] -; CHECK-NEXT: st r1, [p0, dj2] -; CHECK-NEXT: mov p0, r28 +; CHECK-NEXT: st r20, [p7, dj1] +; CHECK-NEXT: st r21, [p7, dj3] ; CHECK-NEXT: lda.s8 r18, [p2, dj5] ; CHECK-NEXT: mova dj5, #11 +; CHECK-NEXT: st r0, [p5, dj0] +; CHECK-NEXT: st r1, [p5, dj2] ; CHECK-NEXT: vextract.32 r0, x10, #0, vaddsign1 ; CHECK-NEXT: vextract.32 r1, x10, #1, vaddsign1 -; CHECK-NEXT: st r22, [p6, dj0] -; CHECK-NEXT: st r23, [p6, dj2] +; CHECK-NEXT: st r22, [p7, dj0] +; CHECK-NEXT: st r23, [p7, dj2] ; CHECK-NEXT: lda.s8 r19, [p2, dj5] ; CHECK-NEXT: mova dj5, #12 -; CHECK-NEXT: st r2, [p6, #8] -; CHECK-NEXT: st r3, [p6, #12] +; CHECK-NEXT: st r2, [p7, #8] +; CHECK-NEXT: st r3, [p7, #12] ; CHECK-NEXT: lda.s16 r2, [p0, #4] ; CHECK-NEXT: lda.s16 r3, [p0, #6] -; CHECK-NEXT: lda.s16 r4, [p0, #8] -; CHECK-NEXT: lda.s16 r5, [p0, #10] -; CHECK-NEXT: lda.s16 r6, [p0, #12] -; CHECK-NEXT: lda.s16 r7, [p0, #14] -; CHECK-NEXT: st r0, [p6, #0] -; CHECK-NEXT: st r1, [p6, #4] +; CHECK-NEXT: st r0, [p7, #0] +; CHECK-NEXT: st r1, [p7, #4] ; CHECK-NEXT: lda.s16 r0, [p0, #0] ; CHECK-NEXT: lda.s16 r1, [p0, #2] -; CHECK-NEXT: mov p0, r27 +; CHECK-NEXT: mov p0, r28 ; CHECK-NEXT: lda.s8 r20, [p2, dj5] ; CHECK-NEXT: mova dj5, #13 ; CHECK-NEXT: lda.s8 r21, [p2, dj5] @@ -518,14 +516,14 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: nop ; CHECK-NEXT: mov p4, sp ; CHECK-NEXT: padda [p4], #-256 -; CHECK-NEXT: mov p0, r29 +; CHECK-NEXT: mov p0, r30 ; CHECK-NEXT: lda r0, [p0, #0] ; CHECK-NEXT: lda.s8 r6, [p2, 
#6] ; CHECK-NEXT: lda r1, [p0, #4] ; CHECK-NEXT: lda.s8 r7, [p2, #7] ; CHECK-NEXT: lda r2, [p0, #8] ; CHECK-NEXT: lda r3, [p0, #12] -; CHECK-NEXT: mov p0, r26 +; CHECK-NEXT: mov p0, r27 ; CHECK-NEXT: st r0, [p0], #4 ; CHECK-NEXT: lda.s8 r0, [p2, #0] ; CHECK-NEXT: st r1, [p0], #4 @@ -662,15 +660,15 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: nop ; CHECK-NEXT: mova dj5, #30 ; CHECK-NEXT: lda.s16 r23, [p3, dj5] -; CHECK-NEXT: st.s16 r0, [p5], #2 +; CHECK-NEXT: st.s16 r0, [p6], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: mov p0, r30 +; CHECK-NEXT: mov p0, r31 ; CHECK-NEXT: lda r0, [p0, #0] -; CHECK-NEXT: st.s16 r1, [p5], #2 +; CHECK-NEXT: st.s16 r1, [p6], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -678,7 +676,7 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: lda r1, [p0, #4] -; CHECK-NEXT: st.s16 r2, [p5], #2 +; CHECK-NEXT: st.s16 r2, [p6], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -686,7 +684,7 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: lda r2, [p0, #8] -; CHECK-NEXT: st.s16 r3, [p5], #2 +; CHECK-NEXT: st.s16 r3, [p6], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -694,7 +692,7 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: lda r3, [p0, #12] -; CHECK-NEXT: st.s16 r4, [p5], #2 +; CHECK-NEXT: st.s16 r4, [p6], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -702,7 +700,7 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: lda r4, [p0, #16] -; CHECK-NEXT: st.s16 r5, [p5], #2 +; CHECK-NEXT: st.s16 r5, [p6], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; 
CHECK-NEXT: nop @@ -710,7 +708,7 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: lda r5, [p0, #20] -; CHECK-NEXT: st.s16 r6, [p5], #2 +; CHECK-NEXT: st.s16 r6, [p6], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -718,7 +716,7 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: lda r6, [p0, #24] -; CHECK-NEXT: st.s16 r7, [p5], #2 +; CHECK-NEXT: st.s16 r7, [p6], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -726,10 +724,10 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: lda r7, [p0, #28] -; CHECK-NEXT: mov p0, r25 +; CHECK-NEXT: mov p0, r26 ; CHECK-NEXT: st r0, [p0], #4 ; CHECK-NEXT: lda r0, [p1, #0] -; CHECK-NEXT: st.s16 r16, [p5], #2 +; CHECK-NEXT: st.s16 r16, [p6], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -738,7 +736,7 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: nop ; CHECK-NEXT: st r1, [p0], #4 ; CHECK-NEXT: lda r1, [p1, #4] -; CHECK-NEXT: st.s16 r17, [p5], #2 +; CHECK-NEXT: st.s16 r17, [p6], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -747,7 +745,7 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: nop ; CHECK-NEXT: st r2, [p0], #4 ; CHECK-NEXT: lda r2, [p1, #8] -; CHECK-NEXT: st.s16 r18, [p5], #2 +; CHECK-NEXT: st.s16 r18, [p6], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -755,7 +753,7 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: st r3, [p0], #4 -; CHECK-NEXT: st.s16 r19, [p5], #2 +; CHECK-NEXT: st.s16 r19, [p6], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -764,7 +762,7 @@ define dso_local void @test_load_store_unaligned(<8 x i16> 
noundef %a, <4 x i32> ; CHECK-NEXT: nop ; CHECK-NEXT: st r4, [p0], #4 ; CHECK-NEXT: lda r4, [p1, #16] -; CHECK-NEXT: st.s16 r20, [p5], #2 +; CHECK-NEXT: st.s16 r20, [p6], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -772,7 +770,7 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: st r5, [p0], #4 -; CHECK-NEXT: st.s16 r21, [p5], #2 +; CHECK-NEXT: st.s16 r21, [p6], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -782,96 +780,96 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: st r6, [p0, #0] ; CHECK-NEXT: st r7, [p0, #4] ; CHECK-NEXT: lda r6, [p1, #24] -; CHECK-NEXT: lda r9, [sp, #-568] // 4-byte Folded Reload -; CHECK-NEXT: st.s16 r22, [p5, #0] +; CHECK-NEXT: lda r8, [sp, #-564] // 4-byte Folded Reload +; CHECK-NEXT: st.s16 r22, [p6, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: mov p1, r9 -; CHECK-NEXT: mov p0, r31 +; CHECK-NEXT: mov p0, r8 +; CHECK-NEXT: nop ; CHECK-NEXT: lda r3, [p0], #8 -; CHECK-NEXT: lda r16, [p1, dj4] -; CHECK-NEXT: lda r18, [p1, dj6] -; CHECK-NEXT: lda r20, [p1, dj1] -; CHECK-NEXT: lda r22, [p1, dj0] -; CHECK-NEXT: st.s16 r23, [p5, #2] +; CHECK-NEXT: st.s16 r23, [p6, #2] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: lda p6, [sp, #-572] // 4-byte Folded Reload ; CHECK-NEXT: lda r5, [p0, #0] ; CHECK-NEXT: lda r7, [p0, #8] -; CHECK-NEXT: mov p0, r24 +; CHECK-NEXT: mov p0, r25 ; CHECK-NEXT: st r0, [p0], #4 +; CHECK-NEXT: lda r0, [p5, #4] ; CHECK-NEXT: st r1, [p0], #4 -; CHECK-NEXT: lda r1, [p1, #4] +; CHECK-NEXT: lda r1, [p5, #8] ; CHECK-NEXT: st r2, [p0], #4 -; CHECK-NEXT: lda r2, [p1, #8] +; CHECK-NEXT: lda r2, [p5, #16] ; CHECK-NEXT: st r3, [p0], #4 +; CHECK-NEXT: lda r3, [p5, #24] ; CHECK-NEXT: st r4, [p0], #4 -; CHECK-NEXT: lda r4, [p1, #16] +; CHECK-NEXT: lda r4, 
[p5, dj4] ; CHECK-NEXT: st r5, [p0], #4 +; CHECK-NEXT: lda r5, [p5, dj6] ; CHECK-NEXT: st r6, [p0, #0] ; CHECK-NEXT: st r7, [p0, #4] -; CHECK-NEXT: mov p0, p1 -; CHECK-NEXT: lda r6, [p1, #24] -; CHECK-NEXT: lda r0, [p0], #12 -; CHECK-NEXT: lda r3, [p0], #8 -; CHECK-NEXT: lda r5, [p0], #8 -; CHECK-NEXT: lda r7, [p0], #8 -; CHECK-NEXT: lda r17, [p0], #8 -; CHECK-NEXT: lda r19, [p0], #8 -; CHECK-NEXT: lda r21, [p0, #0] -; CHECK-NEXT: lda r23, [p0, #8] -; CHECK-NEXT: mov p0, r8 -; CHECK-NEXT: lda r8, [sp, #-564] // 4-byte Folded Reload -; CHECK-NEXT: st r0, [p7], #4 -; CHECK-NEXT: lda r0, [p6, #0] -; CHECK-NEXT: st r1, [p7], #4 -; CHECK-NEXT: lda r1, [p6, #4] -; CHECK-NEXT: st r2, [p7], #4 -; CHECK-NEXT: lda r2, [p6, #8] -; CHECK-NEXT: st r3, [p7], #4 -; CHECK-NEXT: lda r3, [p6, #12] -; CHECK-NEXT: st r4, [p7], #4 -; CHECK-NEXT: lda r4, [p6, #16] -; CHECK-NEXT: st r5, [p7], #4 -; CHECK-NEXT: lda r5, [p6, #20] -; CHECK-NEXT: st r6, [p7], #4 -; CHECK-NEXT: lda r6, [p6, #24] -; CHECK-NEXT: st r7, [p7], #4 -; CHECK-NEXT: st r0, [p0], #4 -; CHECK-NEXT: lda r7, [p6, #28] -; CHECK-NEXT: st r16, [p7], #4 -; CHECK-NEXT: lda r16, [p6, dj4] +; CHECK-NEXT: mov p0, r9 +; CHECK-NEXT: lda r6, [p5, dj1] +; CHECK-NEXT: lda r7, [p5, dj0] +; CHECK-NEXT: lda r16, [p5], #12 +; CHECK-NEXT: lda r9, [sp, #-568] // 4-byte Folded Reload +; CHECK-NEXT: lda r17, [p5], #8 +; CHECK-NEXT: lda r18, [p5], #8 +; CHECK-NEXT: lda r19, [p5], #8 +; CHECK-NEXT: lda r20, [p5], #8 +; CHECK-NEXT: lda r21, [p5], #8 +; CHECK-NEXT: lda r22, [p5, #0] +; CHECK-NEXT: lda r23, [p5, #8] +; CHECK-NEXT: st r16, [p0], #4 +; CHECK-NEXT: lda r16, [p7, dj4] ; CHECK-NEXT: mova dj4, #36 +; CHECK-NEXT: st r0, [p0], #4 +; CHECK-NEXT: lda r0, [p7, #0] ; CHECK-NEXT: st r1, [p0], #4 -; CHECK-NEXT: st r17, [p7], #4 +; CHECK-NEXT: lda r1, [p7, #4] +; CHECK-NEXT: st r17, [p0], #4 +; CHECK-NEXT: lda r17, [p7, dj4] ; CHECK-NEXT: st r2, [p0], #4 -; CHECK-NEXT: lda r17, [p6, dj4] -; CHECK-NEXT: st r18, [p7], #4 +; CHECK-NEXT: lda r2, 
[p7, #8] +; CHECK-NEXT: st r18, [p0], #4 +; CHECK-NEXT: lda r18, [p7, dj6] ; CHECK-NEXT: st r3, [p0], #4 -; CHECK-NEXT: lda r18, [p6, dj6] -; CHECK-NEXT: st r19, [p7], #4 +; CHECK-NEXT: lda r3, [p7, #12] +; CHECK-NEXT: st r19, [p0], #4 +; CHECK-NEXT: lda r19, [p7, dj7] ; CHECK-NEXT: st r4, [p0], #4 -; CHECK-NEXT: lda r19, [p6, dj7] -; CHECK-NEXT: st r20, [p7], #4 +; CHECK-NEXT: lda r4, [p7, #16] +; CHECK-NEXT: st r20, [p0], #4 +; CHECK-NEXT: lda r20, [p7, dj1] ; CHECK-NEXT: st r5, [p0], #4 -; CHECK-NEXT: lda r20, [p6, dj1] -; CHECK-NEXT: st r21, [p7], #4 +; CHECK-NEXT: lda r5, [p7, #20] +; CHECK-NEXT: st r21, [p0], #4 +; CHECK-NEXT: lda r21, [p7, dj3] ; CHECK-NEXT: st r6, [p0], #4 -; CHECK-NEXT: lda r21, [p6, dj3] -; CHECK-NEXT: st r22, [p7, #0] -; CHECK-NEXT: st r23, [p7, #4] -; CHECK-NEXT: st r7, [p0], #4 -; CHECK-NEXT: lda r22, [p6, dj0] -; CHECK-NEXT: lda r23, [p6, dj2] +; CHECK-NEXT: lda r6, [p7, #24] +; CHECK-NEXT: st r22, [p0], #4 +; CHECK-NEXT: lda r22, [p7, dj0] +; CHECK-NEXT: st r7, [p0, #0] +; CHECK-NEXT: st r23, [p0, #4] +; CHECK-NEXT: mov p0, r24 +; CHECK-NEXT: lda r7, [p7, #28] +; CHECK-NEXT: lda r23, [p7, dj2] ; CHECK-NEXT: lda p7, [sp, #-576] // 4-byte Folded Reload -; CHECK-NEXT: lda p6, [sp, #-572] // 4-byte Folded Reload ; CHECK-NEXT: paddxm [sp], #-576 +; CHECK-NEXT: st r0, [p0], #4 +; CHECK-NEXT: st r1, [p0], #4 +; CHECK-NEXT: st r2, [p0], #4 +; CHECK-NEXT: st r3, [p0], #4 +; CHECK-NEXT: st r4, [p0], #4 +; CHECK-NEXT: st r5, [p0], #4 +; CHECK-NEXT: st r6, [p0], #4 +; CHECK-NEXT: st r7, [p0], #4 ; CHECK-NEXT: st r16, [p0], #4 ; CHECK-NEXT: st r17, [p0], #4 ; CHECK-NEXT: st r18, [p0], #4