From 5a53fd639e17afd269114856ce8e34113fd0d573 Mon Sep 17 00:00:00 2001 From: Valentijn van de Beek Date: Mon, 13 May 2024 16:36:27 +0100 Subject: [PATCH 1/9] [AIE2] Enable G_CONCAT_VECTOR optimizations for AIE2 --- .../Target/AIE/AIE2PreLegalizerCombiner.cpp | 4 + .../CodeGen/AIE/aie2/intrinsics-shufflevec.ll | 207 +++++------------- 2 files changed, 62 insertions(+), 149 deletions(-) diff --git a/llvm/lib/Target/AIE/AIE2PreLegalizerCombiner.cpp b/llvm/lib/Target/AIE/AIE2PreLegalizerCombiner.cpp index 37865902ad13..bf2969b00068 100644 --- a/llvm/lib/Target/AIE/AIE2PreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AIE/AIE2PreLegalizerCombiner.cpp @@ -22,6 +22,7 @@ #include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h" #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" #include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/IR/IntrinsicsAIE2.h" #include "llvm/InitializePasses.h" @@ -167,6 +168,9 @@ bool AIE2PreLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const { case TargetOpcode::G_INTRINSIC: { return tryToCombineIntrinsic(MI); } + case TargetOpcode::G_SHUFFLE_VECTOR: { + return Helper.tryCombineShuffleVector(MI); + } default: break; } diff --git a/llvm/test/CodeGen/AIE/aie2/intrinsics-shufflevec.ll b/llvm/test/CodeGen/AIE/aie2/intrinsics-shufflevec.ll index f61e90620642..81d4d1905ac5 100644 --- a/llvm/test/CodeGen/AIE/aie2/intrinsics-shufflevec.ll +++ b/llvm/test/CodeGen/AIE/aie2/intrinsics-shufflevec.ll @@ -93,57 +93,36 @@ define <16 x i32> @test_insert_vector(<16 x i32> noundef %a, i32 noundef %idx, < ; CHECK-LABEL: test_insert_vector: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopa ; nopb ; nopx ; mov r24, r16; nops -; CHECK-NEXT: mov r25, r17 -; CHECK-NEXT: mov r26, r18 -; CHECK-NEXT: mov r27, r19 -; CHECK-NEXT: mova r19, #0 -; CHECK-NEXT: mova r18, #1 -; CHECK-NEXT: mova r17, #2 -; CHECK-NEXT: mova r16, #3 -; CHECK-NEXT: vextract.s32 r4, x4, r16 -; CHECK-NEXT: mova r16, #4 -; 
CHECK-NEXT: vextract.s32 r1, x4, r19 -; CHECK-NEXT: vextract.s32 r2, x4, r18 -; CHECK-NEXT: vextract.s32 r3, x4, r17 -; CHECK-NEXT: vextract.s32 r5, x4, r16 -; CHECK-NEXT: mova r16, #5 -; CHECK-NEXT: vextract.s32 r6, x4, r16 -; CHECK-NEXT: mova r16, #7 -; CHECK-NEXT: vextract.s32 r7, x4, r16 -; CHECK-NEXT: mova r16, #6 -; CHECK-NEXT: vextract.s32 r8, x4, r16 -; CHECK-NEXT: vpush.lo.32 x0, r7, x0 -; CHECK-NEXT: vpush.lo.32 x0, r8, x0 -; CHECK-NEXT: vpush.lo.32 x0, r6, x0 -; CHECK-NEXT: jz r0, #.LBB1_2 -; CHECK-NEXT: vpush.lo.32 x0, r5, x0 // Delay Slot 5 -; CHECK-NEXT: vpush.lo.32 x0, r4, x0 // Delay Slot 4 -; CHECK-NEXT: vpush.lo.32 x0, r3, x0 // Delay Slot 3 -; CHECK-NEXT: vpush.lo.32 x0, r2, x0 // Delay Slot 2 -; CHECK-NEXT: vpush.lo.32 x0, r1, x0 // Delay Slot 1 +; CHECK-NEXT: nopb ; nopa ; nops ; jz r0, #.LBB1_2; nopv +; CHECK-NEXT: nopx // Delay Slot 5 +; CHECK-NEXT: nop // Delay Slot 4 +; CHECK-NEXT: nop // Delay Slot 3 +; CHECK-NEXT: mov r24, r16 // Delay Slot 2 +; CHECK-NEXT: mova r16, #0 // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %if.end -; CHECK-NEXT: mova r16, #3; nopxm -; CHECK-NEXT: vextract.s32 r0, x2, r19 -; CHECK-NEXT: vextract.s32 r1, x0, r19 -; CHECK-NEXT: vextract.s32 r2, x2, r18 -; CHECK-NEXT: vextract.s32 r3, x0, r18 -; CHECK-NEXT: vextract.s32 r4, x2, r17 -; CHECK-NEXT: vextract.s32 r5, x0, r17 +; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vextract.s32 r0, x2, r16; nopv +; CHECK-NEXT: vextract.s32 r1, x4, r16 +; CHECK-NEXT: mova r16, #1 +; CHECK-NEXT: vextract.s32 r2, x2, r16 +; CHECK-NEXT: vextract.s32 r3, x4, r16 +; CHECK-NEXT: mova r16, #2 +; CHECK-NEXT: vextract.s32 r4, x2, r16 +; CHECK-NEXT: vextract.s32 r5, x4, r16 +; CHECK-NEXT: mova r16, #3 ; CHECK-NEXT: vextract.s32 r6, x2, r16 -; CHECK-NEXT: vextract.s32 r7, x0, r16 +; CHECK-NEXT: vextract.s32 r7, x4, r16 ; CHECK-NEXT: mova r16, #4 ; CHECK-NEXT: vextract.s32 r8, x2, r16 -; CHECK-NEXT: vextract.s32 r9, x0, r16 +; CHECK-NEXT: vextract.s32 r9, x4, r16 ; CHECK-NEXT: mova r16, #5 ; 
CHECK-NEXT: vextract.s32 r10, x2, r16 -; CHECK-NEXT: vextract.s32 r11, x0, r16 +; CHECK-NEXT: vextract.s32 r11, x4, r16 ; CHECK-NEXT: mova r16, #7 ; CHECK-NEXT: vextract.s32 r12, x2, r16 -; CHECK-NEXT: vextract.s32 r13, x0, r16 +; CHECK-NEXT: vextract.s32 r13, x4, r16 ; CHECK-NEXT: mova r16, #6 ; CHECK-NEXT: vextract.s32 r14, x2, r16 -; CHECK-NEXT: vextract.s32 r15, x0, r16 +; CHECK-NEXT: vextract.s32 r15, x4, r16 ; CHECK-NEXT: vpush.lo.32 x0, r13, x0 ; CHECK-NEXT: vpush.lo.32 x0, r15, x0 ; CHECK-NEXT: vpush.lo.32 x0, r11, x0 @@ -155,34 +134,37 @@ define <16 x i32> @test_insert_vector(<16 x i32> noundef %a, i32 noundef %idx, < ; CHECK-NEXT: vpush.lo.32 x0, r12, x0 ; CHECK-NEXT: vpush.lo.32 x0, r14, x0 ; CHECK-NEXT: vpush.lo.32 x0, r10, x0 -; CHECK-NEXT: j #.LBB1_3 -; CHECK-NEXT: vpush.lo.32 x0, r8, x0 // Delay Slot 5 -; CHECK-NEXT: vpush.lo.32 x0, r6, x0 // Delay Slot 4 -; CHECK-NEXT: vpush.lo.32 x0, r4, x0 // Delay Slot 3 -; CHECK-NEXT: vpush.lo.32 x0, r2, x0 // Delay Slot 2 -; CHECK-NEXT: vpush.lo.32 x0, r0, x0 // Delay Slot 1 +; CHECK-NEXT: vpush.lo.32 x0, r8, x0 +; CHECK-NEXT: ret lr +; CHECK-NEXT: vpush.lo.32 x0, r6, x0 // Delay Slot 5 +; CHECK-NEXT: vpush.lo.32 x0, r4, x0 // Delay Slot 4 +; CHECK-NEXT: vpush.lo.32 x0, r2, x0 // Delay Slot 3 +; CHECK-NEXT: vpush.lo.32 x0, r0, x0 // Delay Slot 2 +; CHECK-NEXT: mov r16, r24 // Delay Slot 1 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB1_2: // %if.then -; CHECK-NEXT: nopb ; mova r16, #3; nops ; nopxm ; nopv -; CHECK-NEXT: vextract.s32 r0, x0, r19 -; CHECK-NEXT: vextract.s32 r1, x2, r19 -; CHECK-NEXT: vextract.s32 r2, x0, r18 -; CHECK-NEXT: vextract.s32 r3, x2, r18 -; CHECK-NEXT: vextract.s32 r4, x0, r17 -; CHECK-NEXT: vextract.s32 r5, x2, r17 -; CHECK-NEXT: vextract.s32 r6, x0, r16 +; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vextract.s32 r0, x4, r16; nopv +; CHECK-NEXT: vextract.s32 r1, x2, r16 +; CHECK-NEXT: mova r16, #1 +; CHECK-NEXT: vextract.s32 r2, x4, r16 +; CHECK-NEXT: vextract.s32 r3, x2, r16 +; CHECK-NEXT: 
mova r16, #2 +; CHECK-NEXT: vextract.s32 r4, x4, r16 +; CHECK-NEXT: vextract.s32 r5, x2, r16 +; CHECK-NEXT: mova r16, #3 +; CHECK-NEXT: vextract.s32 r6, x4, r16 ; CHECK-NEXT: vextract.s32 r7, x2, r16 ; CHECK-NEXT: mova r16, #4 -; CHECK-NEXT: vextract.s32 r8, x0, r16 +; CHECK-NEXT: vextract.s32 r8, x4, r16 ; CHECK-NEXT: vextract.s32 r9, x2, r16 ; CHECK-NEXT: mova r16, #5 -; CHECK-NEXT: vextract.s32 r10, x0, r16 +; CHECK-NEXT: vextract.s32 r10, x4, r16 ; CHECK-NEXT: vextract.s32 r11, x2, r16 ; CHECK-NEXT: mova r16, #7 -; CHECK-NEXT: vextract.s32 r12, x0, r16 +; CHECK-NEXT: vextract.s32 r12, x4, r16 ; CHECK-NEXT: vextract.s32 r13, x2, r16 ; CHECK-NEXT: mova r16, #6 -; CHECK-NEXT: vextract.s32 r14, x0, r16 +; CHECK-NEXT: vextract.s32 r14, x4, r16 ; CHECK-NEXT: vextract.s32 r15, x2, r16 ; CHECK-NEXT: vpush.lo.32 x0, r13, x0 ; CHECK-NEXT: vpush.lo.32 x0, r15, x0 @@ -196,17 +178,11 @@ define <16 x i32> @test_insert_vector(<16 x i32> noundef %a, i32 noundef %idx, < ; CHECK-NEXT: vpush.lo.32 x0, r14, x0 ; CHECK-NEXT: vpush.lo.32 x0, r10, x0 ; CHECK-NEXT: vpush.lo.32 x0, r8, x0 -; CHECK-NEXT: vpush.lo.32 x0, r6, x0 -; CHECK-NEXT: vpush.lo.32 x0, r4, x0 -; CHECK-NEXT: vpush.lo.32 x0, r2, x0 -; CHECK-NEXT: vpush.lo.32 x0, r0, x0 -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: .LBB1_3: // %cleanup -; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm ; nops -; CHECK-NEXT: nop // Delay Slot 5 -; CHECK-NEXT: mov r19, r27 // Delay Slot 4 -; CHECK-NEXT: mov r18, r26 // Delay Slot 3 -; CHECK-NEXT: mov r17, r25 // Delay Slot 2 +; CHECK-NEXT: ret lr +; CHECK-NEXT: vpush.lo.32 x0, r6, x0 // Delay Slot 5 +; CHECK-NEXT: vpush.lo.32 x0, r4, x0 // Delay Slot 4 +; CHECK-NEXT: vpush.lo.32 x0, r2, x0 // Delay Slot 3 +; CHECK-NEXT: vpush.lo.32 x0, r0, x0 // Delay Slot 2 ; CHECK-NEXT: mov r16, r24 // Delay Slot 1 entry: %shuffle = shufflevector <8 x i32> %b, <8 x i32> undef, <16 x i32> @@ -230,49 +206,12 @@ define <16 x i32> @test_concat_vector(<8 x i32> noundef %a, <8 x i32> noundef %b ; CHECK-LABEL: 
test_concat_vector: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopx ; mov r24, r16 -; CHECK-NEXT: mova r16, #0 -; CHECK-NEXT: vextract.s32 r0, x2, r16 -; CHECK-NEXT: vextract.s32 r1, x4, r16 -; CHECK-NEXT: mova r16, #1 -; CHECK-NEXT: vextract.s32 r2, x2, r16 -; CHECK-NEXT: vextract.s32 r3, x4, r16 -; CHECK-NEXT: mova r16, #2 -; CHECK-NEXT: vextract.s32 r4, x2, r16 -; CHECK-NEXT: vextract.s32 r5, x4, r16 -; CHECK-NEXT: mova r16, #3 -; CHECK-NEXT: vextract.s32 r6, x2, r16 -; CHECK-NEXT: vextract.s32 r7, x4, r16 -; CHECK-NEXT: mova r16, #4 -; CHECK-NEXT: vextract.s32 r8, x2, r16 -; CHECK-NEXT: vextract.s32 r9, x4, r16 -; CHECK-NEXT: mova r16, #5 -; CHECK-NEXT: vextract.s32 r10, x2, r16 -; CHECK-NEXT: vextract.s32 r11, x4, r16 -; CHECK-NEXT: mova r16, #7 -; CHECK-NEXT: vextract.s32 r12, x2, r16 -; CHECK-NEXT: vextract.s32 r13, x4, r16 -; CHECK-NEXT: mova r16, #6 -; CHECK-NEXT: vextract.s32 r14, x2, r16 -; CHECK-NEXT: vextract.s32 r15, x4, r16 -; CHECK-NEXT: vpush.lo.32 x0, r13, x0 -; CHECK-NEXT: vpush.lo.32 x0, r15, x0 -; CHECK-NEXT: vpush.lo.32 x0, r11, x0 -; CHECK-NEXT: vpush.lo.32 x0, r9, x0 -; CHECK-NEXT: vpush.lo.32 x0, r7, x0 -; CHECK-NEXT: vpush.lo.32 x0, r5, x0 -; CHECK-NEXT: vpush.lo.32 x0, r3, x0 -; CHECK-NEXT: vpush.lo.32 x0, r1, x0 -; CHECK-NEXT: vpush.lo.32 x0, r12, x0 -; CHECK-NEXT: vpush.lo.32 x0, r14, x0 -; CHECK-NEXT: vpush.lo.32 x0, r10, x0 -; CHECK-NEXT: vpush.lo.32 x0, r8, x0 -; CHECK-NEXT: ret lr -; CHECK-NEXT: vpush.lo.32 x0, r6, x0 // Delay Slot 5 -; CHECK-NEXT: vpush.lo.32 x0, r4, x0 // Delay Slot 4 -; CHECK-NEXT: vpush.lo.32 x0, r2, x0 // Delay Slot 3 -; CHECK-NEXT: vpush.lo.32 x0, r0, x0 // Delay Slot 2 -; CHECK-NEXT: mov r16, r24 // Delay Slot 1 +; CHECK-NEXT: nopb ; nopa ; nops ; ret lr ; nopm ; nopv +; CHECK-NEXT: nopx // Delay Slot 5 +; CHECK-NEXT: nop // Delay Slot 4 +; CHECK-NEXT: vmov wl0, wl2 // Delay Slot 3 +; CHECK-NEXT: vmov wh0, wl4 // Delay Slot 2 +; CHECK-NEXT: nop // Delay Slot 1 entry: %shuffle = 
shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> ret <16 x i32> %shuffle @@ -282,44 +221,14 @@ define <16 x i32> @test_set_vector(i32 noundef %idx, <8 x i32> noundef %a) { ; CHECK-LABEL: test_set_vector: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopa ; nopb ; nopx ; mov r9, r16 -; CHECK-NEXT: mova r16, #0 +; CHECK-NEXT: mov r1, r16 ; CHECK-NEXT: eqz r0, r0 -; CHECK-NEXT: vextract.s32 r1, x2, r16 -; CHECK-NEXT: mova r16, #1 -; CHECK-NEXT: vextract.s32 r2, x2, r16 -; CHECK-NEXT: mova r16, #2 -; CHECK-NEXT: vextract.s32 r3, x2, r16 -; CHECK-NEXT: mova r16, #3 -; CHECK-NEXT: vextract.s32 r4, x2, r16 -; CHECK-NEXT: mova r16, #4 -; CHECK-NEXT: vextract.s32 r5, x2, r16 -; CHECK-NEXT: mova r16, #5 -; CHECK-NEXT: vextract.s32 r6, x2, r16 -; CHECK-NEXT: mova r16, #7 -; CHECK-NEXT: vextract.s32 r7, x2, r16 -; CHECK-NEXT: mova r16, #6 -; CHECK-NEXT: vextract.s32 r8, x2, r16 -; CHECK-NEXT: add r16, r0, #-1 -; CHECK-NEXT: vpush.lo.32 x0, r7, x0 -; CHECK-NEXT: vpush.lo.32 x0, r8, x0 -; CHECK-NEXT: vpush.lo.32 x0, r6, x0 -; CHECK-NEXT: vpush.lo.32 x0, r5, x0 -; CHECK-NEXT: vpush.lo.32 x0, r4, x0 -; CHECK-NEXT: vpush.lo.32 x0, r3, x0 -; CHECK-NEXT: vpush.lo.32 x0, r2, x0 -; CHECK-NEXT: vpush.lo.32 x0, r1, x0 -; CHECK-NEXT: vpush.lo.32 x2, r0, x0 -; CHECK-NEXT: vpush.lo.32 x2, r0, x2 -; CHECK-NEXT: vpush.lo.32 x2, r0, x2 -; CHECK-NEXT: vpush.lo.32 x2, r0, x2 -; CHECK-NEXT: vpush.lo.32 x2, r0, x2 ; CHECK-NEXT: ret lr -; CHECK-NEXT: vpush.lo.32 x2, r0, x2 // Delay Slot 5 -; CHECK-NEXT: vpush.lo.32 x2, r0, x2 // Delay Slot 4 -; CHECK-NEXT: vpush.lo.32 x2, r0, x2 // Delay Slot 3 -; CHECK-NEXT: vsel.32 x0, x0, x2, r16 // Delay Slot 2 -; CHECK-NEXT: mov r16, r9 // Delay Slot 1 +; CHECK-NEXT: vmov wh0, wl2 // Delay Slot 5 +; CHECK-NEXT: vmov wl0, wl2 // Delay Slot 4 +; CHECK-NEXT: add r16, r0, #-1 // Delay Slot 3 +; CHECK-NEXT: vsel.32 x0, x0, x0, r16 // Delay Slot 2 +; CHECK-NEXT: mov r16, r1 // Delay Slot 1 entry: %cmp = icmp eq i32 %idx, 0 %shuffle = 
shufflevector <8 x i32> %a, <8 x i32> undef, <16 x i32> From 432c40956eacce4b2d4b50b2767ed284291ba503 Mon Sep 17 00:00:00 2001 From: Valentijn van de Beek Date: Fri, 20 Sep 2024 10:43:49 +0200 Subject: [PATCH 2/9] [GISel][CombinerHelper] Add stream generator type for combiners These generators are used to match onto shufflemask for optimizations. The idea is that each shufflemask essentially encodes a function that turns one vector into another. Generators are those functions and allow us to match shufflevectors by generating masks. Since masks are frequently very similar, this allows to define many masks in relatively few lines. --- llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index b2132562ac3f..a74bc64e987f 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -27,6 +27,7 @@ #include "llvm/CodeGenTypes/LowLevelType.h" #include "llvm/IR/InstrTypes.h" #include +#include namespace llvm { @@ -246,6 +247,11 @@ class CombinerHelper { void applyCombineShuffleConcat(MachineInstr &MI, SmallVector &Ops); /// Try to combine G_SHUFFLE_VECTOR into G_CONCAT_VECTORS. + /// A function type that returns either the next value in a + /// shufflemask or an empty value. Each iteration should return + /// one value, like a Python iterator or a Lisp stream. + using GeneratorType = std::function()>; + /// Returns true if MI changed. /// /// \pre MI.getOpcode() == G_SHUFFLE_VECTOR. From 0cd7ef6f833b5d1515869ff7983309699527aef6 Mon Sep 17 00:00:00 2001 From: Valentijn van de Beek Date: Fri, 20 Sep 2024 10:47:39 +0200 Subject: [PATCH 3/9] [GISel][CombinerHelper] Counter stream that counts from 0 to N We check for iterative shift masks which corresponds to the CONCAT_VECTOR instruction. 
--- llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index ec7ca5dc8e2b..3ef63451bf91 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -42,6 +42,8 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetMachine.h" #include +#include +#include #include #include @@ -384,6 +386,19 @@ void CombinerHelper::applyCombineShuffleConcat(MachineInstr &MI, MI.eraseFromParent(); } +// Create a stream from 0 to n with a specified number of steps +CombinerHelper::GeneratorType +adderGenerator(const int32_t From, const int32_t To, const int32_t StepSize) { + int32_t Counter = From; + return [Counter, To, StepSize]() mutable { + std::optional OldCount = std::optional(Counter); + Counter += StepSize; + if (OldCount == (To + StepSize)) + OldCount = {}; + return OldCount; + }; +} + bool CombinerHelper::tryCombineShuffleVector(MachineInstr &MI) { SmallVector Ops; if (matchCombineShuffleVector(MI, Ops)) { From b5e22cc71154151de608edd8d8ab6c6c3d1566ac Mon Sep 17 00:00:00 2001 From: Valentijn van de Beek Date: Thu, 15 Aug 2024 09:18:11 +0100 Subject: [PATCH 4/9] [GISel][CombinerHelper] Use a stream to check for G_CONCAT_VECTOR We check for iterative shift masks which corresponds to the CONCAT_VECTOR instruction. 
--- .../llvm/CodeGen/GlobalISel/CombinerHelper.h | 13 +- .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 91 ++-- .../prelegalizercombiner-shufflevector.mir | 430 ++++++++++++++++++ 3 files changed, 492 insertions(+), 42 deletions(-) create mode 100644 llvm/test/CodeGen/AIE/aie2/GlobalISel/prelegalizercombiner-shufflevector.mir diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index a74bc64e987f..aba31d183b6c 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -246,24 +246,23 @@ class CombinerHelper { /// or an implicit_def if \p Ops is empty. void applyCombineShuffleConcat(MachineInstr &MI, SmallVector &Ops); - /// Try to combine G_SHUFFLE_VECTOR into G_CONCAT_VECTORS. /// A function type that returns either the next value in a /// shufflemask or an empty value. Each iteration should return /// one value, like a Python iterator or a Lisp stream. using GeneratorType = std::function()>; + /// Try to combine G_SHUFFLE_VECTOR into more efficient opcodes. /// Returns true if MI changed. /// /// \pre MI.getOpcode() == G_SHUFFLE_VECTOR. bool tryCombineShuffleVector(MachineInstr &MI); - /// Check if the G_SHUFFLE_VECTOR \p MI can be replaced by a - /// concat_vectors. - /// \p Ops will contain the operands needed to produce the flattened - /// concat_vectors. + /// Check if the G_SHUFFLE_VECTOR \p MI can be replaced by checking + /// whether the shufflemask given matches that of a given generator. /// /// \pre MI.getOpcode() == G_SHUFFLE_VECTOR. - bool matchCombineShuffleVector(MachineInstr &MI, - SmallVectorImpl &Ops); + bool matchCombineShuffleVector(MachineInstr &MI, GeneratorType Generator, + const size_t TargetDstSize); + /// Replace \p MI with a concat_vectors with \p Ops. 
void applyCombineShuffleVector(MachineInstr &MI, const ArrayRef Ops); diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 3ef63451bf91..e8a9f519be71 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -400,16 +400,64 @@ adderGenerator(const int32_t From, const int32_t To, const int32_t StepSize) { } bool CombinerHelper::tryCombineShuffleVector(MachineInstr &MI) { + const Register DstReg = MI.getOperand(0).getReg(); + const LLT DstTy = MRI.getType(DstReg); + const LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); + const unsigned DstNumElts = DstTy.isVector() ? DstTy.getNumElements() : 1; + const unsigned SrcNumElts = SrcTy.isVector() ? SrcTy.getNumElements() : 1; + + // {1, 2, ..., n} -> G_CONCAT_VECTOR + // Turns a shuffle vector that only increments into a concat vector + // instruction + GeneratorType CountUp = adderGenerator(0, DstNumElts - 1, 1); SmallVector Ops; - if (matchCombineShuffleVector(MI, Ops)) { + + if (matchCombineShuffleVector(MI, CountUp, 2 * SrcNumElts)) { + // The shuffle is concatenating multiple vectors together. + // Collect the different operands for that. + Register UndefReg; + const Register Src1 = MI.getOperand(1).getReg(); + const Register Src2 = MI.getOperand(2).getReg(); + + const ArrayRef Mask = MI.getOperand(3).getShuffleMask(); + + // The destination can be longer than the source, so we separate them into + // equal blocks and check them separately to see if one of the blocks can be + // copied whole. + unsigned NumConcat = DstNumElts / SrcNumElts; + unsigned Index = 0; + for (unsigned Concat = 0; Concat < NumConcat; Concat++) { + unsigned Target = (Concat + 1) * SrcNumElts; + while (Index < Target) { + int MaskElt = Mask[Index]; + if (MaskElt >= 0) { + Ops.push_back((MaskElt < (int)SrcNumElts) ? 
Src1 : Src2); + break; + } + Index++; + } + + if (Index == Target) { + if (!UndefReg) { + Builder.setInsertPt(*MI.getParent(), MI); + UndefReg = Builder.buildUndef(SrcTy).getReg(0); + } + Ops.push_back(UndefReg); + } + + Index = Target; + } + applyCombineShuffleVector(MI, Ops); return true; } + return false; } bool CombinerHelper::matchCombineShuffleVector(MachineInstr &MI, - SmallVectorImpl &Ops) { + GeneratorType Generator, + const size_t TargetDstSize) { assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR && "Invalid instruction kind"); LLT DstType = MRI.getType(MI.getOperand(0).getReg()); @@ -436,51 +484,24 @@ bool CombinerHelper::matchCombineShuffleVector(MachineInstr &MI, // // TODO: If the size between the source and destination don't match // we could still emit an extract vector element in that case. - if (DstNumElts < 2 * SrcNumElts && DstNumElts != 1) + if ((DstNumElts < TargetDstSize) && DstNumElts != 1) return false; - // Check that the shuffle mask can be broken evenly between the - // different sources. - if (DstNumElts % SrcNumElts != 0) - return false; - - // Mask length is a multiple of the source vector length. - // Check if the shuffle is some kind of concatenation of the input - // vectors. - unsigned NumConcat = DstNumElts / SrcNumElts; - SmallVector ConcatSrcs(NumConcat, -1); ArrayRef Mask = MI.getOperand(3).getShuffleMask(); for (unsigned i = 0; i != DstNumElts; ++i) { int Idx = Mask[i]; + const int32_t ShiftIndex = Generator().value_or(-1); + // Undef value. - if (Idx < 0) + if (Idx < 0 || ShiftIndex < 0) continue; + // Ensure the indices in each SrcType sized piece are sequential and that // the same source is used for the whole piece. - if ((Idx % SrcNumElts != (i % SrcNumElts)) || - (ConcatSrcs[i / SrcNumElts] >= 0 && - ConcatSrcs[i / SrcNumElts] != (int)(Idx / SrcNumElts))) + if ((Idx % SrcNumElts != (ShiftIndex % SrcNumElts))) return false; - // Remember which source this index came from. 
- ConcatSrcs[i / SrcNumElts] = Idx / SrcNumElts; } - // The shuffle is concatenating multiple vectors together. - // Collect the different operands for that. - Register UndefReg; - Register Src2 = MI.getOperand(2).getReg(); - for (auto Src : ConcatSrcs) { - if (Src < 0) { - if (!UndefReg) { - Builder.setInsertPt(*MI.getParent(), MI); - UndefReg = Builder.buildUndef(SrcType).getReg(0); - } - Ops.push_back(UndefReg); - } else if (Src == 0) - Ops.push_back(Src1); - else - Ops.push_back(Src2); - } return true; } diff --git a/llvm/test/CodeGen/AIE/aie2/GlobalISel/prelegalizercombiner-shufflevector.mir b/llvm/test/CodeGen/AIE/aie2/GlobalISel/prelegalizercombiner-shufflevector.mir new file mode 100644 index 000000000000..0fe7b0735743 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/GlobalISel/prelegalizercombiner-shufflevector.mir @@ -0,0 +1,430 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2023-2024 Advanced Micro Devices, Inc. 
or its affiliates +# RUN: llc -mtriple aie2 -run-pass=aie2-prelegalizer-combiner %s -verify-machineinstrs -o - | FileCheck %s + +--- +name: concat_vector_32_512 +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_32_512 + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl4 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[COPY]](<8 x s32>), [[COPY1]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... + +--- +name: concat_vector_32_1024 +legalized: false +body: | + bb.1.entry: + liveins: $x0, $x1 + ; CHECK-LABEL: name: concat_vector_32_1024 + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<16 x s32>) = COPY $x1 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<32 x s32>) = G_CONCAT_VECTORS [[COPY]](<16 x s32>), [[COPY1]](<16 x s32>) + ; CHECK-NEXT: $y2 = COPY [[CONCAT_VECTORS]](<32 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $y2 + %1:_(<16 x s32>) = COPY $x0 + %2:_(<16 x s32>) = COPY $x1 + %0:_(<32 x s32>) = G_SHUFFLE_VECTOR %1:_(<16 x s32>), %2:_, shufflemask(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31) + $y2 = COPY %0:_(<32 x s32>) + PseudoRET implicit $lr, implicit $y2 +... 
+ +--- +name: concat_vector_32_256 +legalized: false +body: | + bb.1.entry: + liveins: $wl0 + ; CHECK-LABEL: name: concat_vector_32_256 + ; CHECK: liveins: $wl0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl0 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[COPY]](<8 x s32>) + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[UV1]](<4 x s32>), [[UV]](<4 x s32>) + ; CHECK-NEXT: $wl0 = COPY [[CONCAT_VECTORS]](<8 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $wl0 + %1:_(<8 x s32>) = COPY $wl0 + %2:_(<4 x s32>), %3:_(<4 x s32>) = G_UNMERGE_VALUES %1:_(<8 x s32>) + %0:_(<8 x s32>) = G_SHUFFLE_VECTOR %3:_(<4 x s32>), %2:_, shufflemask(0, 1, 2, 3, 4, 5, 6, 7) + $wl0 = COPY %0:_(<8 x s32>) + PseudoRET implicit $lr, implicit $wl0 +... + +--- +name: concat_vector_16_512 +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_16_512 + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s16>) = COPY $wl2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<16 x s16>) = COPY $wl4 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<32 x s16>) = G_CONCAT_VECTORS [[COPY]](<16 x s16>), [[COPY1]](<16 x s16>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<32 x s16>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<16 x s16>) = COPY $wl2 + %2:_(<16 x s16>) = COPY $wl4 + %0:_(<32 x s16>) = G_SHUFFLE_VECTOR %1:_(<16 x s16>), %2:_, shufflemask(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31) + $x0 = COPY %0:_(<32 x s16>) + PseudoRET implicit $lr, implicit $x0 +... 
+ +--- +name: concat_vector_8_512 +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_8_512 + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<32 x s8>) = COPY $wl2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<32 x s8>) = COPY $wl4 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<64 x s8>) = G_CONCAT_VECTORS [[COPY]](<32 x s8>), [[COPY1]](<32 x s8>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<64 x s8>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<32 x s8>) = COPY $wl2 + %2:_(<32 x s8>) = COPY $wl4 + %0:_(<64 x s8>) = G_SHUFFLE_VECTOR %1:_(<32 x s8>), %2:_, shufflemask(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63) + $x0 = COPY %0:_(<64 x s8>) + PseudoRET implicit $lr, implicit $x0 +... + +--- +name: concat_vector_32_512_second_end +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_32_512_second_end + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl4 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[COPY]](<8 x s32>), [[COPY1]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, -1, -1, -1, -1) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... 
+ +--- +name: concat_vector_32_512_first_start +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_32_512_first_start + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl4 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[COPY]](<8 x s32>), [[COPY1]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(-1, -1, -1, -1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... + +--- +name: concat_vector_32_512_first_end +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_32_512_first_end + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl4 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[COPY]](<8 x s32>), [[COPY1]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(0, 1, 2, 3, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... 
+ +--- +name: concat_vector_32_512_second_start +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_32_512_second_start + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl4 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[COPY]](<8 x s32>), [[COPY1]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1, -1, 12, 13, 14, 15) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... + +--- +name: concat_vector_32_512_first_block +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_32_512_first_block + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl4 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<8 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[DEF]](<8 x s32>), [[COPY]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... 
+ +--- +name: concat_vector_32_512_second_block +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_32_512_second_block + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl2 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<8 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[COPY]](<8 x s32>), [[DEF]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... + +--- +name: concat_vector_32_512_random +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_32_512_random + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl4 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[COPY]](<8 x s32>), [[COPY1]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(0, -1, 2, -1, 4, -1, -1, 7, 8, 9, -1, 11, 12, -1, 14, -1) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... 
+ +--- +name: concat_vector_reverse_32_512 +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_reverse_32_512 + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl4 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[COPY1]](<8 x s32>), [[COPY]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... + +--- +name: concat_vector_reverse_32_512_undef_start_first +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_reverse_32_512_undef_start_first + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl4 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[COPY1]](<8 x s32>), [[COPY]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(-1, -1, -1, -1, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... 
+ +--- +name: concat_vector_reverse_32_512_start_end +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_reverse_32_512_start_end + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl4 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[COPY1]](<8 x s32>), [[COPY]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(8, 9, 10, 11, 12, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... + +--- +name: concat_vector_reverse_32_512_end_start +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_reverse_32_512_end_start + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl4 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[COPY1]](<8 x s32>), [[COPY]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, 4, 5, 6, 7) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... 
+ +--- +name: concat_vector_reverse_32_512_end_end +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_reverse_32_512_end_end + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl4 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[COPY1]](<8 x s32>), [[COPY]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, -1, -1, -1, -1) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... + +--- +name: concat_vector_reverse_32_512_first_block +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_reverse_32_512_first_block + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl2 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<8 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[DEF]](<8 x s32>), [[COPY]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(-1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... 
+ +--- +name: concat_vector_reverse_32_512_second_block +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_reverse_32_512_second_block + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl4 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<8 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[COPY]](<8 x s32>), [[DEF]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... + +--- +name: concat_vector_reverse_32_512_random +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_reverse_32_512_random + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl4 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[COPY1]](<8 x s32>), [[COPY]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(8, 9, -1, 11, 12, 13, -1, 15, 0, 1, -1, 3, 4, 5, -1, 7) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... 
From d3f07f5144e5b1ac6f4a8adb32fe444d27fc3f34 Mon Sep 17 00:00:00 2001 From: Valentijn van de Beek Date: Fri, 20 Sep 2024 11:34:42 +0200 Subject: [PATCH 5/9] [NIT] Remove some useless newlines and add a newline for the test --- llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index e8a9f519be71..4457d7c7bed0 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -418,7 +418,6 @@ bool CombinerHelper::tryCombineShuffleVector(MachineInstr &MI) { Register UndefReg; const Register Src1 = MI.getOperand(1).getReg(); const Register Src2 = MI.getOperand(2).getReg(); - const ArrayRef Mask = MI.getOperand(3).getShuffleMask(); // The destination can be longer than the source, so we separate them into @@ -451,7 +450,6 @@ bool CombinerHelper::tryCombineShuffleVector(MachineInstr &MI) { applyCombineShuffleVector(MI, Ops); return true; } - return false; } From 8d757e7cb0ffe92ad25e8a88ce620f3da8cb7f1d Mon Sep 17 00:00:00 2001 From: Valentijn van de Beek Date: Mon, 23 Sep 2024 23:17:40 +0200 Subject: [PATCH 6/9] [GISel][CombinerHelper] Add a helper that unmerges a vector to a target size --- .../llvm/CodeGen/GlobalISel/CombinerHelper.h | 10 ++++ .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 48 +++++++++++++++++++ 2 files changed, 58 insertions(+) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index aba31d183b6c..5d8bc0ae452b 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -263,6 +263,16 @@ class CombinerHelper { bool matchCombineShuffleVector(MachineInstr &MI, GeneratorType Generator, const size_t TargetDstSize); + /// Create G_UNMERGE_VECTOR instructions until the source has reached a + /// target vector size. 
+ /// + /// Requires that the destination fits evenly in the source register. It + /// allows you to pass which of the different destination sized slices + /// you require. + Register createUnmergeValue(MachineInstr &MI, const Register SrcReg, + const Register DstReg, uint8_t DestinationIndex, + const uint32_t Start, const uint32_t End); + /// Replace \p MI with a concat_vectors with \p Ops. void applyCombineShuffleVector(MachineInstr &MI, const ArrayRef Ops); diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 4457d7c7bed0..6a69851cfea6 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -399,6 +399,54 @@ adderGenerator(const int32_t From, const int32_t To, const int32_t StepSize) { }; } +Register CombinerHelper::createUnmergeValue( + MachineInstr &MI, const Register SrcReg, const Register DstReg, + const uint8_t DestinationIndex, const uint32_t Start, const uint32_t End) { + Builder.setInsertPt(*MI.getParent(), MI); + const LLT DstTy = MRI.getType(DstReg); + const LLT SrcTy = MRI.getType(SrcReg); + assert((DstTy.isScalar() || + (SrcTy.getNumElements() % DstTy.getNumElements()) == 0) && + "destination vector must divide source cleanly"); + + const unsigned HalfElements = SrcTy.getNumElements() / 2; + const LLT ScalarTy = SrcTy.getScalarType(); + const LLT HalfSizeTy = (HalfElements == 1) + ? ScalarTy + : LLT::fixed_vector(HalfElements, ScalarTy); + const Register TmpReg = MRI.createGenericVirtualRegister(HalfSizeTy); + Register TargetReg = DstReg; + if (DstTy != HalfSizeTy) { + TargetReg = MRI.createGenericVirtualRegister(HalfSizeTy); + } + + // Each destination fits n times into the source and each iteration we exactly + // half the source. Therefore we need to pick on which side we want to iterate + // on. + const uint32_t DstNumElements = DstTy.isVector() ? 
DstTy.getNumElements() : 1; + const uint32_t HalfWay = Start + ((End - Start) / 2); + const uint32_t Position = DestinationIndex * DstNumElements; + + uint32_t NextStart, NextEnd; + if (Position < HalfWay) { + Builder.buildInstr(TargetOpcode::G_UNMERGE_VALUES, {TargetReg, TmpReg}, + {SrcReg}); + NextStart = Start; + NextEnd = HalfWay; + } else { + Builder.buildInstr(TargetOpcode::G_UNMERGE_VALUES, {TmpReg, TargetReg}, + {SrcReg}); + NextStart = HalfWay; + NextEnd = End; + } + + if (HalfSizeTy.isVector() && DstTy != HalfSizeTy) + return createUnmergeValue(MI, TargetReg, DstReg, DestinationIndex, + NextStart, NextEnd); + + return DstReg; +} + bool CombinerHelper::tryCombineShuffleVector(MachineInstr &MI) { const Register DstReg = MI.getOperand(0).getReg(); const LLT DstTy = MRI.getType(DstReg); From e4b0f01872786f2e00af107e0aa2526a01c3e9a0 Mon Sep 17 00:00:00 2001 From: Valentijn van de Beek Date: Mon, 23 Sep 2024 23:16:44 +0200 Subject: [PATCH 7/9] [GISel][CombinerHelper] Add matcher code for unmerging the first half of vector A and B --- .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 43 +++++ llvm/test/CodeGen/AArch64/ext-narrow-index.ll | 8 +- llvm/test/CodeGen/AArch64/vecreduce-add.ll | 13 +- .../prelegalizercombiner-shufflevector.mir | 153 ++++++++++++++++++ .../CodeGen/AIE/aie2/intrinsics-shufflevec.ll | 65 ++------ 5 files changed, 214 insertions(+), 68 deletions(-) diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 6a69851cfea6..102460631379 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -449,11 +449,20 @@ Register CombinerHelper::createUnmergeValue( bool CombinerHelper::tryCombineShuffleVector(MachineInstr &MI) { const Register DstReg = MI.getOperand(0).getReg(); + const Register SrcReg1 = MI.getOperand(1).getReg(); + const Register SrcReg2 = MI.getOperand(2).getReg(); + const LLT DstTy = MRI.getType(DstReg); const LLT SrcTy = 
MRI.getType(MI.getOperand(1).getReg()); + const unsigned DstNumElts = DstTy.isVector() ? DstTy.getNumElements() : 1; const unsigned SrcNumElts = SrcTy.isVector() ? SrcTy.getNumElements() : 1; + // This test is a bit silly, but it is required because some tests rely on + // the legalizer changing the type of the shufflevector. + if (DstTy.getScalarSizeInBits() == 1) + return false; + // {1, 2, ..., n} -> G_CONCAT_VECTOR // Turns a shuffle vector that only increments into a concat vector // instruction @@ -498,6 +507,40 @@ bool CombinerHelper::tryCombineShuffleVector(MachineInstr &MI) { applyCombineShuffleVector(MI, Ops); return true; } + + // {1, 2, ..., |DstVector|} -> G_UNMERGE_VALUES + // Extracts the first chunk of the same size of the destination vector from + // the source + GeneratorType FirstQuarter = adderGenerator(0, DstNumElts - 1, 1); + if (matchCombineShuffleVector(MI, FirstQuarter, DstNumElts - 1)) { + // This optimization does not work if the target type is not a multiple of + // two, this can happen in some backends that support uneven vector types. + // We also need to make sure that the vector can be split into two. + if (SrcTy == DstTy || ((SrcNumElts / 2) % 2) != 0 || + SrcNumElts % DstNumElts != 0) + return false; + ArrayRef Mask = MI.getOperand(3).getShuffleMask(); + const Register TargetReg = Mask[0] < (int)SrcNumElts ? 
SrcReg1 : SrcReg2; + createUnmergeValue(MI, TargetReg, DstReg, 0, 0, SrcNumElts); + MI.eraseFromParent(); + return true; + } + + // {|DstVector|, |DstVector|+1, ..., 2 * |DstVector|} -> G_UNMERGE_VALUES + // Extracts the second chunk of the same size of the destination vector from + // the source + GeneratorType SecondQuarter = + adderGenerator(DstNumElts, (DstNumElts * 2) - 1, 1); + if (matchCombineShuffleVector(MI, SecondQuarter, DstNumElts - 1)) { + if (((SrcNumElts / 2) % 2) != 0 || SrcNumElts % DstNumElts != 0) + return false; + ArrayRef Mask = MI.getOperand(3).getShuffleMask(); + const Register TargetReg = Mask[0] < (int)SrcNumElts ? SrcReg1 : SrcReg2; + createUnmergeValue(MI, TargetReg, DstReg, 1, 0, SrcNumElts); + MI.eraseFromParent(); + return true; + } + return false; } diff --git a/llvm/test/CodeGen/AArch64/ext-narrow-index.ll b/llvm/test/CodeGen/AArch64/ext-narrow-index.ll index 2c5d33da93c8..6f095c59f2a6 100644 --- a/llvm/test/CodeGen/AArch64/ext-narrow-index.ll +++ b/llvm/test/CodeGen/AArch64/ext-narrow-index.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=aarch64 | FileCheck %s --check-prefixes=CHECK,CHECK-SD ; RUN: llc < %s -global-isel -mtriple=aarch64 | FileCheck %s --check-prefixes=CHECK,CHECK-GISEL +; Modifications (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates ; Tests of shufflevector where the index operand is half the width of the vector ; operands. We should get one ext instruction and not two. 
@@ -42,8 +43,7 @@ define <8 x i8> @i8_off8(<16 x i8> %arg1, <16 x i8> %arg2) { ; ; CHECK-GISEL-LABEL: i8_off8: ; CHECK-GISEL: // %bb.0: // %entry -; CHECK-GISEL-NEXT: ext v0.16b, v0.16b, v1.16b, #8 -; CHECK-GISEL-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GISEL-NEXT: mov d0, v0.d[1] ; CHECK-GISEL-NEXT: ret entry: %shuffle = shufflevector <16 x i8> %arg1, <16 x i8> %arg2, <8 x i32> @@ -254,9 +254,7 @@ define <8 x i8> @i8_zero_off8(<16 x i8> %arg1) { ; ; CHECK-GISEL-LABEL: i8_zero_off8: ; CHECK-GISEL: // %bb.0: // %entry -; CHECK-GISEL-NEXT: movi v1.2d, #0000000000000000 -; CHECK-GISEL-NEXT: ext v0.16b, v0.16b, v1.16b, #8 -; CHECK-GISEL-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GISEL-NEXT: mov d0, v0.d[1] ; CHECK-GISEL-NEXT: ret entry: %shuffle = shufflevector <16 x i8> %arg1, <16 x i8> zeroinitializer, <8 x i32> diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll index 3254c5ebe9c6..0ef0e6e22922 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll @@ -3,6 +3,7 @@ ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+dotprod %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-DOT ; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel -global-isel-abort=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-BASE ; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel -global-isel-abort=2 %s -o - -mattr=+dotprod 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-DOT +; Modifications (c) Copyright 2024 Advanced Micro Devices, Inc. 
or its affiliates define i32 @addv_v2i32(<2 x i32> %a) { ; CHECK-LABEL: addv_v2i32: @@ -3744,17 +3745,13 @@ define i32 @add_pair_v8i16_v4i32_double_sext_zext_shuffle(<8 x i16> %ax, <8 x i1 ; CHECK-GI-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ushll v4.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll2 v0.4s, v0.8h, #0 ; CHECK-GI-NEXT: ushll v5.4s, v1.4h, #0 -; CHECK-GI-NEXT: ushll2 v1.4s, v1.8h, #0 ; CHECK-GI-NEXT: ushll v6.4s, v2.4h, #0 -; CHECK-GI-NEXT: ushll2 v2.4s, v2.8h, #0 ; CHECK-GI-NEXT: ushll v7.4s, v3.4h, #0 -; CHECK-GI-NEXT: ushll2 v3.4s, v3.8h, #0 -; CHECK-GI-NEXT: add v0.4s, v4.4s, v0.4s -; CHECK-GI-NEXT: add v1.4s, v5.4s, v1.4s -; CHECK-GI-NEXT: add v2.4s, v6.4s, v2.4s -; CHECK-GI-NEXT: add v3.4s, v7.4s, v3.4s +; CHECK-GI-NEXT: uaddw2 v0.4s, v4.4s, v0.8h +; CHECK-GI-NEXT: uaddw2 v1.4s, v5.4s, v1.8h +; CHECK-GI-NEXT: uaddw2 v2.4s, v6.4s, v2.8h +; CHECK-GI-NEXT: uaddw2 v3.4s, v7.4s, v3.8h ; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s ; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s diff --git a/llvm/test/CodeGen/AIE/aie2/GlobalISel/prelegalizercombiner-shufflevector.mir b/llvm/test/CodeGen/AIE/aie2/GlobalISel/prelegalizercombiner-shufflevector.mir index 0fe7b0735743..082554d3ade3 100644 --- a/llvm/test/CodeGen/AIE/aie2/GlobalISel/prelegalizercombiner-shufflevector.mir +++ b/llvm/test/CodeGen/AIE/aie2/GlobalISel/prelegalizercombiner-shufflevector.mir @@ -133,6 +133,25 @@ body: | PseudoRET implicit $lr, implicit $x0 ... 
+--- +name: extract_vector_1024_to_512 +legalized: false +body: | + bb.1.entry: + liveins: $y2 + ; CHECK-LABEL: name: extract_vector_1024_to_512 + ; CHECK: liveins: $y2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<32 x s32>) = COPY $y2 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<16 x s32>), [[UV1:%[0-9]+]]:_(<16 x s32>) = G_UNMERGE_VALUES [[COPY]](<32 x s32>) + ; CHECK-NEXT: $x0 = COPY [[UV]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<32 x s32>) = COPY $y2 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<32 x s32>), %1:_, shufflemask(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... + --- name: concat_vector_32_512_first_start legalized: false @@ -154,6 +173,26 @@ body: | PseudoRET implicit $lr, implicit $x0 ... +--- +name: extract_vector_1024_to_256 +legalized: false +body: | + bb.1.entry: + liveins: $y2 + ; CHECK-LABEL: name: extract_vector_1024_to_256 + ; CHECK: liveins: $y2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<32 x s32>) = COPY $y2 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<16 x s32>), [[UV1:%[0-9]+]]:_(<16 x s32>) = G_UNMERGE_VALUES [[COPY]](<32 x s32>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<8 x s32>), [[UV3:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[UV]](<16 x s32>) + ; CHECK-NEXT: $wl0 = COPY [[UV2]](<8 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<32 x s32>) = COPY $y2 + %0:_(<8 x s32>) = G_SHUFFLE_VECTOR %1:_(<32 x s32>), %1:_, shufflemask(0, 1, 2, 3, 4, 5, 6, 7) + $wl0 = COPY %0:_(<8 x s32>) + PseudoRET implicit $lr, implicit $x0 +... + --- name: concat_vector_32_512_first_end legalized: false @@ -428,3 +467,117 @@ body: | $x0 = COPY %0:_(<16 x s32>) PseudoRET implicit $lr, implicit $x0 ... 
+ +--- +name: extract_vector_1024_to_128 +legalized: false +body: | + bb.1.entry: + liveins: $y2 + ; CHECK-LABEL: name: extract_vector_1024_to_128 + ; CHECK: liveins: $y2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<32 x s32>) = COPY $y2 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<16 x s32>), [[UV1:%[0-9]+]]:_(<16 x s32>) = G_UNMERGE_VALUES [[COPY]](<32 x s32>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<8 x s32>), [[UV3:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[UV]](<16 x s32>) + ; CHECK-NEXT: [[AIE_UNPAD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_AIE_UNPAD_VECTOR [[UV2]](<8 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[AIE_UNPAD_VECTOR]](<4 x s32>) + %1:_(<32 x s32>) = COPY $y2 + %0:_(<4 x s32>) = G_SHUFFLE_VECTOR %1:_(<32 x s32>), %1:_, shufflemask(0, 1, 2, 3) + PseudoRET implicit $lr, implicit %0 +... + +--- +name: extract_vector_1024_to_32 +legalized: false +body: | + bb.1.entry: + liveins: $y2 + ; CHECK-LABEL: name: extract_vector_1024_to_32 + ; CHECK: liveins: $y2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<128 x s8>) = COPY $y2 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<64 x s8>), [[UV1:%[0-9]+]]:_(<64 x s8>) = G_UNMERGE_VALUES [[COPY]](<128 x s8>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<32 x s8>), [[UV3:%[0-9]+]]:_(<32 x s8>) = G_UNMERGE_VALUES [[UV]](<64 x s8>) + ; CHECK-NEXT: [[AIE_UNPAD_VECTOR:%[0-9]+]]:_(<16 x s8>) = G_AIE_UNPAD_VECTOR [[UV2]](<32 x s8>) + ; CHECK-NEXT: [[UV4:%[0-9]+]]:_(<8 x s8>), [[UV5:%[0-9]+]]:_(<8 x s8>) = G_UNMERGE_VALUES [[AIE_UNPAD_VECTOR]](<16 x s8>) + ; CHECK-NEXT: [[UV6:%[0-9]+]]:_(<4 x s8>), [[UV7:%[0-9]+]]:_(<4 x s8>) = G_UNMERGE_VALUES [[UV4]](<8 x s8>) + ; CHECK-NEXT: [[UV8:%[0-9]+]]:_(<2 x s8>), [[UV9:%[0-9]+]]:_(<2 x s8>) = G_UNMERGE_VALUES [[UV6]](<4 x s8>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV8]](<2 x s8>) + %1:_(<128 x s8>) = COPY $y2 + %0:_(<2 x s8>) = G_SHUFFLE_VECTOR %1:_(<128 x s8>), %1:_, shufflemask(0, 1) + PseudoRET implicit $lr, implicit %0 +... 
+ +--- +name: extract_vector_second_half_512_to_256 +legalized: false +body: | + bb.1.entry: + liveins: $x0, $x1 + ; CHECK-LABEL: name: extract_vector_second_half_512_to_256 + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY $x0 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<8 x s32>), [[UV1:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[COPY]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV1]](<8 x s32>) + %1:_(<16 x s32>) = COPY $x0 + %2:_(<8 x s32>) = G_SHUFFLE_VECTOR %1:_(<16 x s32>), %1:_(<16 x s32>), shufflemask(8, 9, 10, 11, 12, 13, 14, 15) + PseudoRET implicit $lr, implicit %2 +... + +--- +name: extract_vector_second_half_512_to_128 +legalized: false +body: | + bb.1.entry: + liveins: $x0, $x1 + ; CHECK-LABEL: name: extract_vector_second_half_512_to_128 + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY $x0 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<8 x s32>), [[UV1:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[COPY]](<16 x s32>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<4 x s32>), [[UV3:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[UV]](<8 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV3]](<4 x s32>) + %1:_(<16 x s32>) = COPY $x0 + %2:_(<4 x s32>) = G_SHUFFLE_VECTOR %1:_(<16 x s32>), %1:_(<16 x s32>), shufflemask(4, 5, 6, 7) + PseudoRET implicit $lr, implicit %2 +... 
+ +--- +name: extract_vector_second_half_1024_to_512 +legalized: false +body: | + bb.1.entry: + liveins: $y2, $y3 + ; CHECK-LABEL: name: extract_vector_second_half_1024_to_512 + ; CHECK: liveins: $y2, $y3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<128 x s8>) = COPY $y2 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<64 x s8>), [[UV1:%[0-9]+]]:_(<64 x s8>) = G_UNMERGE_VALUES [[COPY]](<128 x s8>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV1]](<64 x s8>) + %1:_(<128 x s8>) = COPY $y2 + %2:_(<64 x s8>) = G_SHUFFLE_VECTOR %1:_(<128 x s8>), %1:_(<128 x s8>), shufflemask(64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127) + PseudoRET implicit $lr, implicit %2 +... + +--- +name: extract_vector_second_half_1024_to_32 +legalized: false +body: | + bb.1.entry: + liveins: $y2, $y3 + ; CHECK-LABEL: name: extract_vector_second_half_1024_to_32 + ; CHECK: liveins: $y2, $y3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<128 x s8>) = COPY $y2 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<64 x s8>), [[UV1:%[0-9]+]]:_(<64 x s8>) = G_UNMERGE_VALUES [[COPY]](<128 x s8>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<32 x s8>), [[UV3:%[0-9]+]]:_(<32 x s8>) = G_UNMERGE_VALUES [[UV]](<64 x s8>) + ; CHECK-NEXT: [[AIE_UNPAD_VECTOR:%[0-9]+]]:_(<16 x s8>) = G_AIE_UNPAD_VECTOR [[UV2]](<32 x s8>) + ; CHECK-NEXT: [[UV4:%[0-9]+]]:_(<8 x s8>), [[UV5:%[0-9]+]]:_(<8 x s8>) = G_UNMERGE_VALUES [[AIE_UNPAD_VECTOR]](<16 x s8>) + ; CHECK-NEXT: [[UV6:%[0-9]+]]:_(<4 x s8>), [[UV7:%[0-9]+]]:_(<4 x s8>) = G_UNMERGE_VALUES [[UV4]](<8 x s8>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV7]](<4 x s8>) + %1:_(<128 x s8>) = COPY $y2 + %2:_(<4 x s8>) = G_SHUFFLE_VECTOR %1:_(<128 x s8>), %1:_(<128 x s8>), shufflemask(4, 5, 6, 7) + PseudoRET implicit $lr, implicit %2 +... 
diff --git a/llvm/test/CodeGen/AIE/aie2/intrinsics-shufflevec.ll b/llvm/test/CodeGen/AIE/aie2/intrinsics-shufflevec.ll index 81d4d1905ac5..eda80653683b 100644 --- a/llvm/test/CodeGen/AIE/aie2/intrinsics-shufflevec.ll +++ b/llvm/test/CodeGen/AIE/aie2/intrinsics-shufflevec.ll @@ -15,63 +15,18 @@ define <8 x i32> @test_extract_vector(<16 x i32> noundef %a, i32 noundef %idx) { ; CHECK-NEXT: nopa ; nopx // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 ; CHECK-NEXT: nop // Delay Slot 3 -; CHECK-NEXT: nop // Delay Slot 2 -; CHECK-NEXT: mov r8, r16 // Delay Slot 1 +; CHECK-NEXT: vmov x0, x2 // Delay Slot 2 +; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %if.end -; CHECK-NEXT: mova r16, #8; nopb ; nopxm -; CHECK-NEXT: vextract.s32 r0, x2, r16 -; CHECK-NEXT: mova r16, #9 -; CHECK-NEXT: vextract.s32 r1, x2, r16 -; CHECK-NEXT: mova r16, #10 -; CHECK-NEXT: vextract.s32 r2, x2, r16 -; CHECK-NEXT: mova r16, #11 -; CHECK-NEXT: vextract.s32 r3, x2, r16 -; CHECK-NEXT: mova r16, #12 -; CHECK-NEXT: vextract.s32 r4, x2, r16 -; CHECK-NEXT: mova r16, #13 -; CHECK-NEXT: vextract.s32 r5, x2, r16 -; CHECK-NEXT: mova r16, #15 -; CHECK-NEXT: vextract.s32 r6, x2, r16 -; CHECK-NEXT: mova r16, #14 -; CHECK-NEXT: vextract.s32 r7, x2, r16 -; CHECK-NEXT: vpush.lo.32 x0, r6, x0 -; CHECK-NEXT: vpush.lo.32 x0, r7, x0 -; CHECK-NEXT: vpush.lo.32 x0, r5, x0 -; CHECK-NEXT: vpush.lo.32 x0, r4, x0 -; CHECK-NEXT: ret lr -; CHECK-NEXT: vpush.lo.32 x0, r3, x0 // Delay Slot 5 -; CHECK-NEXT: vpush.lo.32 x0, r2, x0 // Delay Slot 4 -; CHECK-NEXT: vpush.lo.32 x0, r1, x0 // Delay Slot 3 -; CHECK-NEXT: vpush.lo.32 x0, r0, x0 // Delay Slot 2 -; CHECK-NEXT: mov r16, r8 // Delay Slot 1 +; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vmov wl0, wh0; nopv ; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: .LBB0_2: // %if.then -; CHECK-NEXT: mova r16, #0; nopb ; nopxm -; CHECK-NEXT: vextract.s32 r0, x2, r16 -; CHECK-NEXT: mova r16, #1 -; CHECK-NEXT: vextract.s32 r1, x2, r16 -; CHECK-NEXT: mova r16, #2 -; CHECK-NEXT: 
vextract.s32 r2, x2, r16 -; CHECK-NEXT: mova r16, #3 -; CHECK-NEXT: vextract.s32 r3, x2, r16 -; CHECK-NEXT: mova r16, #4 -; CHECK-NEXT: vextract.s32 r4, x2, r16 -; CHECK-NEXT: mova r16, #5 -; CHECK-NEXT: vextract.s32 r5, x2, r16 -; CHECK-NEXT: mova r16, #7 -; CHECK-NEXT: vextract.s32 r6, x2, r16 -; CHECK-NEXT: mova r16, #6 -; CHECK-NEXT: vextract.s32 r7, x2, r16 -; CHECK-NEXT: vpush.lo.32 x0, r6, x0 -; CHECK-NEXT: vpush.lo.32 x0, r7, x0 -; CHECK-NEXT: vpush.lo.32 x0, r5, x0 -; CHECK-NEXT: vpush.lo.32 x0, r4, x0 -; CHECK-NEXT: ret lr -; CHECK-NEXT: vpush.lo.32 x0, r3, x0 // Delay Slot 5 -; CHECK-NEXT: vpush.lo.32 x0, r2, x0 // Delay Slot 4 -; CHECK-NEXT: vpush.lo.32 x0, r1, x0 // Delay Slot 3 -; CHECK-NEXT: vpush.lo.32 x0, r0, x0 // Delay Slot 2 -; CHECK-NEXT: mov r16, r8 // Delay Slot 1 +; CHECK-NEXT: .LBB0_2: // %return +; CHECK-NEXT: nopa ; ret lr +; CHECK-NEXT: nop // Delay Slot 5 +; CHECK-NEXT: nop // Delay Slot 4 +; CHECK-NEXT: nop // Delay Slot 3 +; CHECK-NEXT: nop // Delay Slot 2 +; CHECK-NEXT: nop // Delay Slot 1 entry: %cmp = icmp eq i32 %idx, 0 br i1 %cmp, label %if.then, label %if.end From 38729a7086ae28e1bdd4f13b9500741c5a9920de Mon Sep 17 00:00:00 2001 From: Valentijn van de Beek Date: Mon, 13 May 2024 16:18:33 +0100 Subject: [PATCH 8/9] [GISel][CombinerHelper] Add a function that chains a list of generators together --- llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 102460631379..4cdd3d553eb4 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -399,6 +399,21 @@ adderGenerator(const int32_t From, const int32_t To, const int32_t StepSize) { }; } +// Move to the next generator if it is exhausted allowing to chain generators +CombinerHelper::GeneratorType +concatGenerators(SmallVector &Generators) { + auto *GeneratorIterator = 
Generators.begin();
+
+  return [GeneratorIterator, Generators]() mutable {
+    std::optional<int32_t> GenValue = (*GeneratorIterator)();
+    if (!GenValue.has_value() && GeneratorIterator != Generators.end()) {
+      GeneratorIterator++;
+      GenValue = (*GeneratorIterator)();
+    }
+    return GenValue;
+  };
+}
+
 Register CombinerHelper::createUnmergeValue(
     MachineInstr &MI, const Register SrcReg, const Register DstReg,
     const uint8_t DestinationIndex, const uint32_t Start, const uint32_t End) {

From ebe64890817721fa7c68cb8e51ed88c35afe9648 Mon Sep 17 00:00:00 2001
From: Valentijn van de Beek
Date: Fri, 14 Jun 2024 12:32:10 +0100
Subject: [PATCH 9/9] [GISel][CombinerHelper] Add a combiner to concatenate
 the first halves of two vectors together

---
 .../lib/CodeGen/GlobalISel/CombinerHelper.cpp |  38 +++
 .../GlobalISel/combine-shufflevector.mir      |   9 +-
 .../prelegalizercombiner-shuffle-vector.mir   |   7 +-
 llvm/test/CodeGen/AArch64/arm64-neon-copy.ll  |  73 ++---
 .../prelegalizercombiner-shufflevector.mir    | 270 ++++++++++++++++++
 .../CodeGen/AIE/aie2/intrinsics-shufflevec.ll |  98 +------
 6 files changed, 353 insertions(+), 142 deletions(-)

diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 4cdd3d553eb4..2b5571ed5682 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -556,6 +556,44 @@ bool CombinerHelper::tryCombineShuffleVector(MachineInstr &MI) {
     return true;
   }
 
+  // After this point, it is assumed our shufflevectors work on vectors that can
+  // be split into two
+  if ((DstNumElts % 2) != 0)
+    return false;
+
+  // {1, 2, ..., n/4, n/2, n/2+1, .... 3n/4} -> G_UNMERGE_VALUES
+  // Take the first halves of the two vectors and concatenate them into one
+  // vector.
+ GeneratorType FirstEightA = adderGenerator(0, (DstNumElts / 2) - 1, 1); + GeneratorType FirstEightB = + adderGenerator(DstNumElts, DstNumElts + (DstNumElts / 2) - 1, 1); + + auto UnmergeMatcher = SmallVector{FirstEightA, FirstEightB}; + GeneratorType FirstAndThird = concatGenerators(UnmergeMatcher); + if (matchCombineShuffleVector(MI, FirstAndThird, (DstNumElts / 2) - 1)) { + if (DstNumElts <= 2) + return false; + const Register DstReg = MI.getOperand(0).getReg(); + const LLT HalfSrcTy = + LLT::fixed_vector(SrcNumElts / 2, SrcTy.getScalarType()); + const Register HalfOfA = createUnmergeValue( + MI, MI.getOperand(1).getReg(), + MRI.createGenericVirtualRegister(HalfSrcTy), 0, 0, SrcNumElts); + const Register HalfOfB = createUnmergeValue( + MI, MI.getOperand(2).getReg(), + MRI.createGenericVirtualRegister(HalfSrcTy), 0, 0, SrcNumElts); + + const ArrayRef Mask = MI.getOperand(3).getShuffleMask(); + if (Mask[0] <= 0) { + Builder.buildMergeLikeInstr(DstReg, {HalfOfA, HalfOfB}); + } else { + Builder.buildMergeLikeInstr(DstReg, {HalfOfB, HalfOfA}); + } + + MI.eraseFromParent(); + return true; + } + return false; } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-shufflevector.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-shufflevector.mir index 0de989f8be75..b87fdf8bc552 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-shufflevector.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-shufflevector.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 # RUN: llc -o - -mtriple=aarch64-unknown-unknown -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s | FileCheck %s +# Modifications (c) Copyright 2024 Advanced Micro Devices, Inc. 
or its affiliates --- name: shuffle_concat_1 @@ -101,7 +102,9 @@ body: | ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s8>) = G_IMPLICIT_DEF ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s8>) = G_CONCAT_VECTORS %a(<4 x s8>), %b(<4 x s8>), [[DEF]](<4 x s8>), [[DEF]](<4 x s8>) ; CHECK-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<16 x s8>) = G_CONCAT_VECTORS %c(<4 x s8>), [[DEF]](<4 x s8>), [[DEF]](<4 x s8>), [[DEF]](<4 x s8>) - ; CHECK-NEXT: %z:_(<16 x s8>) = G_SHUFFLE_VECTOR [[CONCAT_VECTORS]](<16 x s8>), [[CONCAT_VECTORS1]], shufflemask(0, undef, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, undef, undef, undef, undef) + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<8 x s8>), [[UV1:%[0-9]+]]:_(<8 x s8>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s8>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<8 x s8>), [[UV3:%[0-9]+]]:_(<8 x s8>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<16 x s8>) + ; CHECK-NEXT: %z:_(<16 x s8>) = G_CONCAT_VECTORS [[UV]](<8 x s8>), [[UV2]](<8 x s8>) ; CHECK-NEXT: $q0 = COPY %z(<16 x s8>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %p1:_(p0) = COPY $x0 @@ -179,7 +182,9 @@ body: | ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s8>) = G_IMPLICIT_DEF ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s8>) = G_CONCAT_VECTORS %a(<4 x s8>), %b(<4 x s8>), [[DEF]](<4 x s8>), [[DEF]](<4 x s8>) ; CHECK-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<16 x s8>) = G_CONCAT_VECTORS %c(<4 x s8>), [[DEF]](<4 x s8>), [[DEF]](<4 x s8>), [[DEF]](<4 x s8>) - ; CHECK-NEXT: %z:_(<16 x s8>) = G_SHUFFLE_VECTOR [[CONCAT_VECTORS]](<16 x s8>), [[CONCAT_VECTORS1]], shufflemask(undef, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, undef, undef, undef, undef) + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<8 x s8>), [[UV1:%[0-9]+]]:_(<8 x s8>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s8>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<8 x s8>), [[UV3:%[0-9]+]]:_(<8 x s8>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<16 x s8>) + ; CHECK-NEXT: %z:_(<16 x s8>) = G_CONCAT_VECTORS [[UV]](<8 x s8>), [[UV2]](<8 x s8>) ; CHECK-NEXT: $q0 = COPY %z(<16 x s8>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 
%p1:_(p0) = COPY $x0 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector.mir index 2c9ae5b06b62..1d4651fe70b5 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple aarch64-apple-ios -run-pass=aarch64-prelegalizer-combiner %s -o - | FileCheck %s +# Modifications (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates # Check that we canonicalize shuffle_vector(Src1, Src2, mask(0,1,2,3)) # into concat_vector(Src1, Src2). @@ -270,8 +271,10 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1 - ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[COPY]](<4 x s32>), [[COPY1]], shufflemask(4, 5, 0, 1) - ; CHECK-NEXT: RET_ReallyLR implicit [[SHUF]](<4 x s32>) + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x s32>), [[UV1:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<2 x s32>), [[UV3:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[COPY1]](<4 x s32>) + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[UV2]](<2 x s32>), [[UV]](<2 x s32>) + ; CHECK-NEXT: RET_ReallyLR implicit [[CONCAT_VECTORS]](<4 x s32>) %0:_(<4 x s32>) = COPY $q0 %1:_(<4 x s32>) = COPY $q1 %2:_(<4 x s32>) = G_SHUFFLE_VECTOR %0(<4 x s32>), %1(<4 x s32>), shufflemask(4,5,0,1) diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll index 749d6071c98d..89002fc9de43 100644 --- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefixes=CHECK,CHECK-SD ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI +; Modifications (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates ; CHECK-GI: warning: Instruction selection used fallback path for test_bitcastv2f32tov1f64 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_bitcastv1f64tov2f32 @@ -1776,19 +1777,10 @@ entry: } define <16 x i8> @test_concat_v16i8_v16i8_v16i8(<16 x i8> %x, <16 x i8> %y) #0 { -; CHECK-SD-LABEL: test_concat_v16i8_v16i8_v16i8: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_concat_v16i8_v16i8_v16i8: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: adrp x8, .LCPI126_0 -; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI126_0] -; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_concat_v16i8_v16i8_v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: ret entry: %vecinit30 = shufflevector <16 x i8> %x, <16 x i8> %y, <16 x i32> ret <16 x i8> %vecinit30 @@ -1803,9 +1795,7 @@ define <16 x i8> @test_concat_v16i8_v8i8_v16i8(<8 x i8> %x, <16 x i8> %y) #0 { ; ; CHECK-GI-LABEL: test_concat_v16i8_v8i8_v16i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: adrp x8, .LCPI127_0 -; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: mov b2, v0.b[1] ; CHECK-GI-NEXT: mov b3, v0.b[2] ; CHECK-GI-NEXT: mov b4, v0.b[3] 
@@ -1814,14 +1804,13 @@ define <16 x i8> @test_concat_v16i8_v8i8_v16i8(<8 x i8> %x, <16 x i8> %y) #0 { ; CHECK-GI-NEXT: mov b7, v0.b[6] ; CHECK-GI-NEXT: mov b16, v0.b[7] ; CHECK-GI-NEXT: mov v0.b[1], v2.b[0] -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI127_0] ; CHECK-GI-NEXT: mov v0.b[2], v3.b[0] ; CHECK-GI-NEXT: mov v0.b[3], v4.b[0] ; CHECK-GI-NEXT: mov v0.b[4], v5.b[0] ; CHECK-GI-NEXT: mov v0.b[5], v6.b[0] ; CHECK-GI-NEXT: mov v0.b[6], v7.b[0] ; CHECK-GI-NEXT: mov v0.b[7], v16.b[0] -; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: ret entry: %vecext = extractelement <8 x i8> %x, i32 0 @@ -1999,19 +1988,10 @@ entry: } define <8 x i16> @test_concat_v8i16_v8i16_v8i16(<8 x i16> %x, <8 x i16> %y) #0 { -; CHECK-SD-LABEL: test_concat_v8i16_v8i16_v8i16: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_concat_v8i16_v8i16_v8i16: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: adrp x8, .LCPI130_0 -; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI130_0] -; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_concat_v8i16_v8i16_v8i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: ret entry: %vecinit14 = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> ret <8 x i16> %vecinit14 @@ -2026,17 +2006,14 @@ define <8 x i16> @test_concat_v8i16_v4i16_v8i16(<4 x i16> %x, <8 x i16> %y) #0 { ; ; CHECK-GI-LABEL: test_concat_v8i16_v4i16_v8i16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: adrp x8, .LCPI131_0 -; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: mov h2, v0.h[1] ; 
CHECK-GI-NEXT: mov h3, v0.h[2] ; CHECK-GI-NEXT: mov h4, v0.h[3] ; CHECK-GI-NEXT: mov v0.h[1], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI131_0] ; CHECK-GI-NEXT: mov v0.h[2], v3.h[0] ; CHECK-GI-NEXT: mov v0.h[3], v4.h[0] -; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: ret entry: %vecext = extractelement <4 x i16> %x, i32 0 @@ -2142,19 +2119,10 @@ entry: } define <4 x i32> @test_concat_v4i32_v4i32_v4i32(<4 x i32> %x, <4 x i32> %y) #0 { -; CHECK-SD-LABEL: test_concat_v4i32_v4i32_v4i32: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_concat_v4i32_v4i32_v4i32: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: adrp x8, .LCPI134_0 -; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI134_0] -; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_concat_v4i32_v4i32_v4i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: ret entry: %vecinit6 = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> ret <4 x i32> %vecinit6 @@ -2169,13 +2137,10 @@ define <4 x i32> @test_concat_v4i32_v2i32_v4i32(<2 x i32> %x, <4 x i32> %y) #0 { ; ; CHECK-GI-LABEL: test_concat_v4i32_v2i32_v4i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: adrp x8, .LCPI135_0 -; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: mov s2, v0.s[1] ; CHECK-GI-NEXT: mov v0.s[1], v2.s[0] -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI135_0] -; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: ret entry: %vecext = extractelement <2 x i32> %x, i32 0 diff --git 
a/llvm/test/CodeGen/AIE/aie2/GlobalISel/prelegalizercombiner-shufflevector.mir b/llvm/test/CodeGen/AIE/aie2/GlobalISel/prelegalizercombiner-shufflevector.mir index 082554d3ade3..d14ac147679e 100644 --- a/llvm/test/CodeGen/AIE/aie2/GlobalISel/prelegalizercombiner-shufflevector.mir +++ b/llvm/test/CodeGen/AIE/aie2/GlobalISel/prelegalizercombiner-shufflevector.mir @@ -581,3 +581,273 @@ body: | %2:_(<4 x s8>) = G_SHUFFLE_VECTOR %1:_(<128 x s8>), %1:_(<128 x s8>), shufflemask(4, 5, 6, 7) PseudoRET implicit $lr, implicit %2 ... + +--- +name: extract_vector_third_half_1024 +legalized: false +body: | + bb.1.entry: + liveins: $y2, $y3 + ; CHECK-LABEL: name: extract_vector_third_half_1024 + ; CHECK: liveins: $y2, $y3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<32 x s32>) = COPY $y3 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<16 x s32>), [[UV1:%[0-9]+]]:_(<16 x s32>) = G_UNMERGE_VALUES [[COPY]](<32 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV]](<16 x s32>) + %1:_(<32 x s32>) = COPY $y2 + %2:_(<32 x s32>) = COPY $y3 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<32 x s32>), %2:_, shufflemask(32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47) + PseudoRET implicit $lr, implicit %0 +... + +--- +name: extract_vector_third_half_512 +legalized: false +body: | + bb.1.entry: + liveins: $x0, $x1 + ; CHECK-LABEL: name: extract_vector_third_half_512 + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY $x1 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<8 x s32>), [[UV1:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[COPY]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV]](<8 x s32>) + %1:_(<16 x s32>) = COPY $x0 + %2:_(<16 x s32>) = COPY $x1 + %0:_(<8 x s32>) = G_SHUFFLE_VECTOR %1:_(<16 x s32>), %2:_, shufflemask(16, 17, 18, 19, 20, 21, 22, 23) + PseudoRET implicit $lr, implicit %0 +... 
+ +--- +name: extract_vector_third_half_256 +legalized: false +body: | + bb.1.entry: + liveins: $wl0, $wl1 + ; CHECK-LABEL: name: extract_vector_third_half_256 + ; CHECK: liveins: $wl0, $wl1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl1 + ; CHECK-NEXT: [[AIE_UNPAD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_AIE_UNPAD_VECTOR [[COPY]](<8 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[AIE_UNPAD_VECTOR]](<4 x s32>) + %1:_(<8 x s32>) = COPY $wl0 + %2:_(<8 x s32>) = COPY $wl1 + %0:_(<4 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(8, 9, 10, 11) + PseudoRET implicit $lr, implicit %0 +... + +--- +name: extract_vector_third_half_128 +legalized: false +body: | + bb.1.entry: + liveins: $q0, $q1 + ; CHECK-LABEL: name: extract_vector_third_half_128 + ; CHECK: liveins: $q0, $q1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q1 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x s32>), [[UV1:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV]](<2 x s32>) + %1:_(<4 x s32>) = COPY $q0 + %2:_(<4 x s32>) = COPY $q1 + %0:_(<2 x s32>) = G_SHUFFLE_VECTOR %1:_(<4 x s32>), %2:_, shufflemask(4, 5) + PseudoRET implicit $lr, implicit %0 +... + +--- +name: extract_vector_fourth_half_1024 +legalized: false +body: | + bb.1.entry: + liveins: $y2, $y3 + ; CHECK-LABEL: name: extract_vector_fourth_half_1024 + ; CHECK: liveins: $y2, $y3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<32 x s32>) = COPY $y3 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<16 x s32>), [[UV1:%[0-9]+]]:_(<16 x s32>) = G_UNMERGE_VALUES [[COPY]](<32 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV1]](<16 x s32>) + %1:_(<32 x s32>) = COPY $y2 + %2:_(<32 x s32>) = COPY $y3 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<32 x s32>), %2:_, shufflemask(48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63) + PseudoRET implicit $lr, implicit %0 +... 
+ +--- +name: extract_vector_fourth_half_512 +legalized: false +body: | + bb.1.entry: + liveins: $x0, $x1 + ; CHECK-LABEL: name: extract_vector_fourth_half_512 + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY $x1 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<8 x s32>), [[UV1:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[COPY]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV1]](<8 x s32>) + %1:_(<16 x s32>) = COPY $x0 + %2:_(<16 x s32>) = COPY $x1 + %0:_(<8 x s32>) = G_SHUFFLE_VECTOR %1:_(<16 x s32>), %2:_, shufflemask(24,25,26,27,28,29,30,31) + PseudoRET implicit $lr, implicit %0 +... + +--- +name: extract_vector_fourth_half_256 +legalized: false +body: | + bb.1.entry: + liveins: $wl0, $wl1 + ; CHECK-LABEL: name: extract_vector_fourth_half_256 + ; CHECK: liveins: $wl0, $wl1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl1 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[COPY]](<8 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV1]](<4 x s32>) + %1:_(<8 x s32>) = COPY $wl0 + %2:_(<8 x s32>) = COPY $wl1 + %0:_(<4 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(12,13,14,15) + PseudoRET implicit $lr, implicit %0 +... + +--- +name: extract_vector_fourth_half_128 +legalized: false +body: | + bb.1.entry: + liveins: $q0, $q1 + ; CHECK-LABEL: name: extract_vector_fourth_half_128 + ; CHECK: liveins: $q0, $q1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q1 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x s32>), [[UV1:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV1]](<2 x s32>) + %1:_(<4 x s32>) = COPY $q0 + %2:_(<4 x s32>) = COPY $q1 + %0:_(<2 x s32>) = G_SHUFFLE_VECTOR %1:_(<4 x s32>), %2:_, shufflemask(6,7) + PseudoRET implicit $lr, implicit %0 +... 
+ +--- +name: insert_vector_16_elements +legalized: false +body: | + bb.1.entry: + liveins: $x0, $x1 + ; CHECK-LABEL: name: insert_vector_16_elements + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<16 x s32>) = COPY $x1 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<8 x s32>), [[UV1:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[COPY]](<16 x s32>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<8 x s32>), [[UV3:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[COPY1]](<16 x s32>) + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[UV]](<8 x s32>), [[UV2]](<8 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[CONCAT_VECTORS]](<16 x s32>) + %1:_(<16 x s32>) = COPY $x0 + %2:_(<16 x s32>) = COPY $x1 + %3:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<16 x s32>), %2:_(<16 x s32>), shufflemask(0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23) + PseudoRET implicit $lr, implicit %3 +... + +--- +name: insert_vector_8_elements +legalized: false +body: | + bb.1.entry: + liveins: $wl0, $wl1 + ; CHECK-LABEL: name: insert_vector_8_elements + ; CHECK: liveins: $wl0, $wl1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl1 + ; CHECK-NEXT: [[AIE_UNPAD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_AIE_UNPAD_VECTOR [[COPY]](<8 x s32>) + ; CHECK-NEXT: [[AIE_UNPAD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_AIE_UNPAD_VECTOR [[COPY1]](<8 x s32>) + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[AIE_UNPAD_VECTOR]](<4 x s32>), [[AIE_UNPAD_VECTOR1]](<4 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[CONCAT_VECTORS]](<8 x s32>) + %1:_(<8 x s32>) = COPY $wl0 + %2:_(<8 x s32>) = COPY $wl1 + %3:_(<8 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_(<8 x s32>), shufflemask(0, 1, 2, 3, 8, 9, 10, 11) + PseudoRET implicit $lr, implicit %3 +... 
+ +--- +name: insert_vector_128_elements +legalized: false +body: | + bb.1.entry: + liveins: $y2, $y3 + ; CHECK-LABEL: name: insert_vector_128_elements + ; CHECK: liveins: $y2, $y3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<128 x s8>) = COPY $y2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<128 x s8>) = COPY $y3 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<64 x s8>), [[UV1:%[0-9]+]]:_(<64 x s8>) = G_UNMERGE_VALUES [[COPY]](<128 x s8>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<64 x s8>), [[UV3:%[0-9]+]]:_(<64 x s8>) = G_UNMERGE_VALUES [[COPY1]](<128 x s8>) + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<128 x s8>) = G_CONCAT_VECTORS [[UV]](<64 x s8>), [[UV2]](<64 x s8>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[CONCAT_VECTORS]](<128 x s8>) + %1:_(<128 x s8>) = COPY $y2 + %2:_(<128 x s8>) = COPY $y3 + %3:_(<128 x s8>) = G_SHUFFLE_VECTOR %1:_(<128 x s8>), %2:_(<128 x s8>), shufflemask(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191) + PseudoRET implicit $lr, implicit %3 +... 
+ +--- +name: insert_vector_16_elements_reverse +legalized: false +body: | + bb.1.entry: + liveins: $x0, $x1 + ; CHECK-LABEL: name: insert_vector_16_elements_reverse + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<16 x s32>) = COPY $x1 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<8 x s32>), [[UV1:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[COPY]](<16 x s32>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<8 x s32>), [[UV3:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[COPY1]](<16 x s32>) + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[UV2]](<8 x s32>), [[UV]](<8 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[CONCAT_VECTORS]](<16 x s32>) + %1:_(<16 x s32>) = COPY $x0 + %2:_(<16 x s32>) = COPY $x1 + %3:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<16 x s32>), %2:_(<16 x s32>), shufflemask(16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7) + PseudoRET implicit $lr, implicit %3 +... + +--- +name: insert_vector_8_elements_reverse +legalized: false +body: | + bb.1.entry: + liveins: $wl0, $wl1 + ; CHECK-LABEL: name: insert_vector_8_elements_reverse + ; CHECK: liveins: $wl0, $wl1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl1 + ; CHECK-NEXT: [[AIE_UNPAD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_AIE_UNPAD_VECTOR [[COPY]](<8 x s32>) + ; CHECK-NEXT: [[AIE_UNPAD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_AIE_UNPAD_VECTOR [[COPY1]](<8 x s32>) + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[AIE_UNPAD_VECTOR1]](<4 x s32>), [[AIE_UNPAD_VECTOR]](<4 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[CONCAT_VECTORS]](<8 x s32>) + %1:_(<8 x s32>) = COPY $wl0 + %2:_(<8 x s32>) = COPY $wl1 + %3:_(<8 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_(<8 x s32>), shufflemask(8, 9, 10, 11, 0, 1, 2, 3) + PseudoRET implicit $lr, implicit %3 +... 
+ +--- +name: insert_vector_128_elements_reverse +legalized: false +body: | + bb.1.entry: + liveins: $y2, $y3 + ; CHECK-LABEL: name: insert_vector_128_elements_reverse + ; CHECK: liveins: $y2, $y3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<128 x s8>) = COPY $y2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<128 x s8>) = COPY $y3 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<64 x s8>), [[UV1:%[0-9]+]]:_(<64 x s8>) = G_UNMERGE_VALUES [[COPY]](<128 x s8>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<64 x s8>), [[UV3:%[0-9]+]]:_(<64 x s8>) = G_UNMERGE_VALUES [[COPY1]](<128 x s8>) + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<128 x s8>) = G_CONCAT_VECTORS [[UV2]](<64 x s8>), [[UV]](<64 x s8>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[CONCAT_VECTORS]](<128 x s8>) + %1:_(<128 x s8>) = COPY $y2 + %2:_(<128 x s8>) = COPY $y3 + %3:_(<128 x s8>) = G_SHUFFLE_VECTOR %1:_(<128 x s8>), %2:_(<128 x s8>), shufflemask(128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63) + PseudoRET implicit $lr, implicit %3 +... 
diff --git a/llvm/test/CodeGen/AIE/aie2/intrinsics-shufflevec.ll b/llvm/test/CodeGen/AIE/aie2/intrinsics-shufflevec.ll index eda80653683b..0284bbbe9d7f 100644 --- a/llvm/test/CodeGen/AIE/aie2/intrinsics-shufflevec.ll +++ b/llvm/test/CodeGen/AIE/aie2/intrinsics-shufflevec.ll @@ -49,96 +49,26 @@ define <16 x i32> @test_insert_vector(<16 x i32> noundef %a, i32 noundef %idx, < ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: nopb ; nopa ; nops ; jz r0, #.LBB1_2; nopv -; CHECK-NEXT: nopx // Delay Slot 5 +; CHECK-NEXT: nopa ; nopx // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 ; CHECK-NEXT: nop // Delay Slot 3 -; CHECK-NEXT: mov r24, r16 // Delay Slot 2 -; CHECK-NEXT: mova r16, #0 // Delay Slot 1 +; CHECK-NEXT: nop // Delay Slot 2 +; CHECK-NEXT: vmov wl0, wl4 // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %if.end -; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vextract.s32 r0, x2, r16; nopv -; CHECK-NEXT: vextract.s32 r1, x4, r16 -; CHECK-NEXT: mova r16, #1 -; CHECK-NEXT: vextract.s32 r2, x2, r16 -; CHECK-NEXT: vextract.s32 r3, x4, r16 -; CHECK-NEXT: mova r16, #2 -; CHECK-NEXT: vextract.s32 r4, x2, r16 -; CHECK-NEXT: vextract.s32 r5, x4, r16 -; CHECK-NEXT: mova r16, #3 -; CHECK-NEXT: vextract.s32 r6, x2, r16 -; CHECK-NEXT: vextract.s32 r7, x4, r16 -; CHECK-NEXT: mova r16, #4 -; CHECK-NEXT: vextract.s32 r8, x2, r16 -; CHECK-NEXT: vextract.s32 r9, x4, r16 -; CHECK-NEXT: mova r16, #5 -; CHECK-NEXT: vextract.s32 r10, x2, r16 -; CHECK-NEXT: vextract.s32 r11, x4, r16 -; CHECK-NEXT: mova r16, #7 -; CHECK-NEXT: vextract.s32 r12, x2, r16 -; CHECK-NEXT: vextract.s32 r13, x4, r16 -; CHECK-NEXT: mova r16, #6 -; CHECK-NEXT: vextract.s32 r14, x2, r16 -; CHECK-NEXT: vextract.s32 r15, x4, r16 -; CHECK-NEXT: vpush.lo.32 x0, r13, x0 -; CHECK-NEXT: vpush.lo.32 x0, r15, x0 -; CHECK-NEXT: vpush.lo.32 x0, r11, x0 -; CHECK-NEXT: vpush.lo.32 x0, r9, x0 -; CHECK-NEXT: vpush.lo.32 x0, r7, x0 -; CHECK-NEXT: vpush.lo.32 x0, r5, x0 -; CHECK-NEXT: vpush.lo.32 x0, r3, x0 -; CHECK-NEXT: 
vpush.lo.32 x0, r1, x0 -; CHECK-NEXT: vpush.lo.32 x0, r12, x0 -; CHECK-NEXT: vpush.lo.32 x0, r14, x0 -; CHECK-NEXT: vpush.lo.32 x0, r10, x0 -; CHECK-NEXT: vpush.lo.32 x0, r8, x0 -; CHECK-NEXT: ret lr -; CHECK-NEXT: vpush.lo.32 x0, r6, x0 // Delay Slot 5 -; CHECK-NEXT: vpush.lo.32 x0, r4, x0 // Delay Slot 4 -; CHECK-NEXT: vpush.lo.32 x0, r2, x0 // Delay Slot 3 -; CHECK-NEXT: vpush.lo.32 x0, r0, x0 // Delay Slot 2 -; CHECK-NEXT: mov r16, r24 // Delay Slot 1 +; CHECK-NEXT: nopb ; nopa ; nops ; ret lr ; nopm ; nopv +; CHECK-NEXT: nopx // Delay Slot 5 +; CHECK-NEXT: vmov wh2, wl0 // Delay Slot 4 +; CHECK-NEXT: nop // Delay Slot 3 +; CHECK-NEXT: vmov x0, x2 // Delay Slot 2 +; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB1_2: // %if.then -; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vextract.s32 r0, x4, r16; nopv -; CHECK-NEXT: vextract.s32 r1, x2, r16 -; CHECK-NEXT: mova r16, #1 -; CHECK-NEXT: vextract.s32 r2, x4, r16 -; CHECK-NEXT: vextract.s32 r3, x2, r16 -; CHECK-NEXT: mova r16, #2 -; CHECK-NEXT: vextract.s32 r4, x4, r16 -; CHECK-NEXT: vextract.s32 r5, x2, r16 -; CHECK-NEXT: mova r16, #3 -; CHECK-NEXT: vextract.s32 r6, x4, r16 -; CHECK-NEXT: vextract.s32 r7, x2, r16 -; CHECK-NEXT: mova r16, #4 -; CHECK-NEXT: vextract.s32 r8, x4, r16 -; CHECK-NEXT: vextract.s32 r9, x2, r16 -; CHECK-NEXT: mova r16, #5 -; CHECK-NEXT: vextract.s32 r10, x4, r16 -; CHECK-NEXT: vextract.s32 r11, x2, r16 -; CHECK-NEXT: mova r16, #7 -; CHECK-NEXT: vextract.s32 r12, x4, r16 -; CHECK-NEXT: vextract.s32 r13, x2, r16 -; CHECK-NEXT: mova r16, #6 -; CHECK-NEXT: vextract.s32 r14, x4, r16 -; CHECK-NEXT: vextract.s32 r15, x2, r16 -; CHECK-NEXT: vpush.lo.32 x0, r13, x0 -; CHECK-NEXT: vpush.lo.32 x0, r15, x0 -; CHECK-NEXT: vpush.lo.32 x0, r11, x0 -; CHECK-NEXT: vpush.lo.32 x0, r9, x0 -; CHECK-NEXT: vpush.lo.32 x0, r7, x0 -; CHECK-NEXT: vpush.lo.32 x0, r5, x0 -; CHECK-NEXT: vpush.lo.32 x0, r3, x0 -; CHECK-NEXT: vpush.lo.32 x0, r1, x0 -; CHECK-NEXT: vpush.lo.32 x0, r12, x0 -; 
CHECK-NEXT: vpush.lo.32 x0, r14, x0 -; CHECK-NEXT: vpush.lo.32 x0, r10, x0 -; CHECK-NEXT: vpush.lo.32 x0, r8, x0 ; CHECK-NEXT: ret lr -; CHECK-NEXT: vpush.lo.32 x0, r6, x0 // Delay Slot 5 -; CHECK-NEXT: vpush.lo.32 x0, r4, x0 // Delay Slot 4 -; CHECK-NEXT: vpush.lo.32 x0, r2, x0 // Delay Slot 3 -; CHECK-NEXT: vpush.lo.32 x0, r0, x0 // Delay Slot 2 -; CHECK-NEXT: mov r16, r24 // Delay Slot 1 +; CHECK-NEXT: nop // Delay Slot 5 +; CHECK-NEXT: nop // Delay Slot 4 +; CHECK-NEXT: nop // Delay Slot 3 +; CHECK-NEXT: vmov wh0, wl2 // Delay Slot 2 +; CHECK-NEXT: nop // Delay Slot 1 entry: %shuffle = shufflevector <8 x i32> %b, <8 x i32> undef, <16 x i32> %cmp = icmp eq i32 %idx, 0